{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999708043404214, "eval_steps": 50, "global_step": 12844, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.785509220962483e-05, "grad_norm": 1347.3993625908233, "learning_rate": 1.556420233463035e-07, "loss": 74.7054, "step": 1 }, { "epoch": 0.00015571018441924966, "grad_norm": 1456.2255847949084, "learning_rate": 3.11284046692607e-07, "loss": 74.2679, "step": 2 }, { "epoch": 0.00023356527662887452, "grad_norm": 1434.6830429191543, "learning_rate": 4.6692607003891057e-07, "loss": 75.3661, "step": 3 }, { "epoch": 0.0003114203688384993, "grad_norm": 1364.221924970002, "learning_rate": 6.22568093385214e-07, "loss": 75.125, "step": 4 }, { "epoch": 0.0003892754610481242, "grad_norm": 1320.7286956815276, "learning_rate": 7.782101167315175e-07, "loss": 75.6339, "step": 5 }, { "epoch": 0.00046713055325774903, "grad_norm": 1292.4938623246185, "learning_rate": 9.338521400778211e-07, "loss": 75.5446, "step": 6 }, { "epoch": 0.0005449856454673739, "grad_norm": 1329.2817127314286, "learning_rate": 1.0894941634241247e-06, "loss": 72.6786, "step": 7 }, { "epoch": 0.0006228407376769986, "grad_norm": 1256.7271185649217, "learning_rate": 1.245136186770428e-06, "loss": 72.6786, "step": 8 }, { "epoch": 0.0007006958298866235, "grad_norm": 984.3481716858829, "learning_rate": 1.4007782101167317e-06, "loss": 66.7857, "step": 9 }, { "epoch": 0.0007785509220962483, "grad_norm": 1006.9383053226463, "learning_rate": 1.556420233463035e-06, "loss": 67.1741, "step": 10 }, { "epoch": 0.0008564060143058732, "grad_norm": 940.9569562262595, "learning_rate": 1.7120622568093387e-06, "loss": 65.0938, "step": 11 }, { "epoch": 0.0009342611065154981, "grad_norm": 532.3313918848611, "learning_rate": 1.8677042801556423e-06, "loss": 57.8348, "step": 12 }, { "epoch": 0.001012116198725123, "grad_norm": 493.44705218043737, "learning_rate": 2.023346303501946e-06, "loss": 57.5625, "step": 13 }, { "epoch": 0.0010899712909347478, "grad_norm": 477.0020504659137, "learning_rate": 2.1789883268482493e-06, "loss": 56.4196, "step": 14 }, { "epoch": 0.0011678263831443726, "grad_norm": 466.30307267527184, "learning_rate": 2.3346303501945527e-06, "loss": 55.7679, "step": 15 }, { "epoch": 0.0012456814753539973, "grad_norm": 307.94374667219296, "learning_rate": 2.490272373540856e-06, "loss": 50.3214, "step": 16 }, { "epoch": 0.0013235365675636221, "grad_norm": 305.8217372914769, "learning_rate": 2.64591439688716e-06, "loss": 49.4821, "step": 17 }, { "epoch": 0.001401391659773247, "grad_norm": 249.09737916696668, "learning_rate": 2.8015564202334633e-06, "loss": 49.0089, "step": 18 }, { "epoch": 0.0014792467519828718, "grad_norm": 226.90041041064634, "learning_rate": 2.9571984435797667e-06, "loss": 47.2009, "step": 19 }, { "epoch": 0.0015571018441924967, "grad_norm": 170.59990424471707, "learning_rate": 3.11284046692607e-06, "loss": 46.5134, "step": 20 }, { "epoch": 0.0016349569364021216, "grad_norm": 140.75929699200375, "learning_rate": 3.268482490272374e-06, "loss": 44.2634, "step": 21 }, { "epoch": 0.0017128120286117464, "grad_norm": 163.93875406336684, "learning_rate": 3.4241245136186774e-06, "loss": 42.1205, "step": 22 }, { "epoch": 0.0017906671208213713, "grad_norm": 105.44455098103937, "learning_rate": 3.5797665369649808e-06, "loss": 41.0446, "step": 23 }, { "epoch": 0.0018685222130309961, "grad_norm": 114.02074324562476, "learning_rate": 3.7354085603112846e-06, "loss": 39.4286, "step": 24 }, { "epoch": 0.001946377305240621, "grad_norm": 82.0617664458467, "learning_rate": 3.891050583657588e-06, "loss": 38.9777, "step": 25 }, { "epoch": 0.002024232397450246, "grad_norm": 96.65166903920715, "learning_rate": 4.046692607003892e-06, "loss": 37.6607, "step": 26 }, { "epoch": 0.0021020874896598707, "grad_norm": 67.79346687474086, "learning_rate": 4.202334630350195e-06, "loss": 35.7321, "step": 27 }, { "epoch": 0.0021799425818694955, "grad_norm": 83.61482378254259, "learning_rate": 4.357976653696499e-06, "loss": 34.8839, "step": 28 }, { "epoch": 0.0022577976740791204, "grad_norm": 148.86173352946744, "learning_rate": 4.513618677042802e-06, "loss": 33.6094, "step": 29 }, { "epoch": 0.0023356527662887453, "grad_norm": 86.92369972434973, "learning_rate": 4.669260700389105e-06, "loss": 32.2121, "step": 30 }, { "epoch": 0.00241350785849837, "grad_norm": 64.51338055621417, "learning_rate": 4.824902723735409e-06, "loss": 31.0357, "step": 31 }, { "epoch": 0.0024913629507079945, "grad_norm": 99.8898053695027, "learning_rate": 4.980544747081712e-06, "loss": 30.2232, "step": 32 }, { "epoch": 0.0025692180429176194, "grad_norm": 79.6115636407728, "learning_rate": 5.136186770428015e-06, "loss": 29.567, "step": 33 }, { "epoch": 0.0026470731351272443, "grad_norm": 202.56745953873127, "learning_rate": 5.29182879377432e-06, "loss": 28.4821, "step": 34 }, { "epoch": 0.002724928227336869, "grad_norm": 75.13349927641224, "learning_rate": 5.447470817120623e-06, "loss": 27.846, "step": 35 }, { "epoch": 0.002802783319546494, "grad_norm": 163.8540716017275, "learning_rate": 5.603112840466927e-06, "loss": 28.4688, "step": 36 }, { "epoch": 0.002880638411756119, "grad_norm": 109.88370531743395, "learning_rate": 5.75875486381323e-06, "loss": 27.4152, "step": 37 }, { "epoch": 0.0029584935039657437, "grad_norm": 142.2530821584772, "learning_rate": 5.9143968871595335e-06, "loss": 27.0915, "step": 38 }, { "epoch": 0.0030363485961753685, "grad_norm": 235.28582999017965, "learning_rate": 6.0700389105058364e-06, "loss": 26.5379, "step": 39 }, { "epoch": 0.0031142036883849934, "grad_norm": 64.24077619231144, "learning_rate": 6.22568093385214e-06, "loss": 25.6228, "step": 40 }, { "epoch": 0.0031920587805946182, "grad_norm": 66.61296340539094, "learning_rate": 6.381322957198443e-06, "loss": 25.0179, "step": 41 }, { "epoch": 0.003269913872804243, "grad_norm": 81.03768786819938, "learning_rate": 6.536964980544748e-06, "loss": 24.7545, "step": 42 }, { "epoch": 0.003347768965013868, "grad_norm": 94.89883014607479, "learning_rate": 6.692607003891051e-06, "loss": 24.4375, "step": 43 }, { "epoch": 0.003425624057223493, "grad_norm": 47.93648866929326, "learning_rate": 6.848249027237355e-06, "loss": 23.2098, "step": 44 }, { "epoch": 0.0035034791494331177, "grad_norm": 229.8358274745738, "learning_rate": 7.003891050583658e-06, "loss": 24.3415, "step": 45 }, { "epoch": 0.0035813342416427425, "grad_norm": 75.51215293924854, "learning_rate": 7.1595330739299615e-06, "loss": 22.8817, "step": 46 }, { "epoch": 0.0036591893338523674, "grad_norm": 110.1085445476416, "learning_rate": 7.3151750972762645e-06, "loss": 23.2991, "step": 47 }, { "epoch": 0.0037370444260619922, "grad_norm": 85.407295320185, "learning_rate": 7.470817120622569e-06, "loss": 22.3237, "step": 48 }, { "epoch": 0.003814899518271617, "grad_norm": 162.34623300359343, "learning_rate": 7.626459143968872e-06, "loss": 22.5982, "step": 49 }, { "epoch": 0.003892754610481242, "grad_norm": 73.21669510892191, "learning_rate": 7.782101167315176e-06, "loss": 21.6696, "step": 50 }, { "epoch": 0.003892754610481242, "eval_loss": 2.780122995376587, "eval_runtime": 162.0753, "eval_samples_per_second": 17.77, "eval_steps_per_second": 0.636, "step": 50 }, { "epoch": 0.003970609702690866, "grad_norm": 133.46789402618495, "learning_rate": 7.937743190661478e-06, "loss": 21.9799, "step": 51 }, { "epoch": 0.004048464794900492, "grad_norm": 89.64270844306294, "learning_rate": 8.093385214007784e-06, "loss": 21.3884, "step": 52 }, { "epoch": 0.004126319887110116, "grad_norm": 105.41235016615485, "learning_rate": 8.249027237354086e-06, "loss": 21.154, "step": 53 }, { "epoch": 0.004204174979319741, "grad_norm": 143.47660778077523, "learning_rate": 8.40466926070039e-06, "loss": 21.0513, "step": 54 }, { "epoch": 0.004282030071529366, "grad_norm": 57.56218341134837, "learning_rate": 8.560311284046693e-06, "loss": 20.6964, "step": 55 }, { "epoch": 0.004359885163738991, "grad_norm": 84.46158755115093, "learning_rate": 8.715953307392997e-06, "loss": 20.4196, "step": 56 }, { "epoch": 0.0044377402559486155, "grad_norm": 72.46104308672913, "learning_rate": 8.8715953307393e-06, "loss": 19.8348, "step": 57 }, { "epoch": 0.004515595348158241, "grad_norm": 93.19548520345936, "learning_rate": 9.027237354085603e-06, "loss": 19.5268, "step": 58 }, { "epoch": 0.004593450440367865, "grad_norm": 53.71424587655392, "learning_rate": 9.182879377431907e-06, "loss": 19.2991, "step": 59 }, { "epoch": 0.0046713055325774905, "grad_norm": 64.88588033263832, "learning_rate": 9.33852140077821e-06, "loss": 19.0781, "step": 60 }, { "epoch": 0.004749160624787115, "grad_norm": 45.97669057131462, "learning_rate": 9.494163424124515e-06, "loss": 18.6674, "step": 61 }, { "epoch": 0.00482701571699674, "grad_norm": 46.46353064297449, "learning_rate": 9.649805447470818e-06, "loss": 18.0067, "step": 62 }, { "epoch": 0.004904870809206365, "grad_norm": 33.704345196791316, "learning_rate": 9.80544747081712e-06, "loss": 17.8002, "step": 63 }, { "epoch": 0.004982725901415989, "grad_norm": 32.77684319951011, "learning_rate": 9.961089494163424e-06, "loss": 17.3449, "step": 64 }, { "epoch": 0.005060580993625614, "grad_norm": 44.39942346044534, "learning_rate": 1.0116731517509728e-05, "loss": 16.9464, "step": 65 }, { "epoch": 0.005138436085835239, "grad_norm": 57.24386953932499, "learning_rate": 1.027237354085603e-05, "loss": 16.8527, "step": 66 }, { "epoch": 0.005216291178044864, "grad_norm": 60.24308272401832, "learning_rate": 1.0428015564202336e-05, "loss": 16.4286, "step": 67 }, { "epoch": 0.0052941462702544885, "grad_norm": 37.449492169609975, "learning_rate": 1.058365758754864e-05, "loss": 16.0703, "step": 68 }, { "epoch": 0.005372001362464114, "grad_norm": 37.113781428630325, "learning_rate": 1.0739299610894942e-05, "loss": 16.0737, "step": 69 }, { "epoch": 0.005449856454673738, "grad_norm": 95.39103756079444, "learning_rate": 1.0894941634241246e-05, "loss": 16.2243, "step": 70 }, { "epoch": 0.0055277115468833635, "grad_norm": 38.669766174679935, "learning_rate": 1.105058365758755e-05, "loss": 15.1964, "step": 71 }, { "epoch": 0.005605566639092988, "grad_norm": 38.89355109139913, "learning_rate": 1.1206225680933853e-05, "loss": 15.2076, "step": 72 }, { "epoch": 0.005683421731302613, "grad_norm": 62.940593755117206, "learning_rate": 1.1361867704280155e-05, "loss": 15.0268, "step": 73 }, { "epoch": 0.005761276823512238, "grad_norm": 35.34662035456866, "learning_rate": 1.151750972762646e-05, "loss": 14.6987, "step": 74 }, { "epoch": 0.005839131915721863, "grad_norm": 35.70616821036547, "learning_rate": 1.1673151750972765e-05, "loss": 14.596, "step": 75 }, { "epoch": 0.005916987007931487, "grad_norm": 41.24625806444502, "learning_rate": 1.1828793774319067e-05, "loss": 14.221, "step": 76 }, { "epoch": 0.005994842100141113, "grad_norm": 41.634514293011854, "learning_rate": 1.198443579766537e-05, "loss": 14.2935, "step": 77 }, { "epoch": 0.006072697192350737, "grad_norm": 35.399612238673676, "learning_rate": 1.2140077821011673e-05, "loss": 14.2355, "step": 78 }, { "epoch": 0.006150552284560362, "grad_norm": 52.10697000203984, "learning_rate": 1.2295719844357978e-05, "loss": 14.1696, "step": 79 }, { "epoch": 0.006228407376769987, "grad_norm": 29.9112370424319, "learning_rate": 1.245136186770428e-05, "loss": 14.1551, "step": 80 }, { "epoch": 0.006306262468979612, "grad_norm": 68.50904919444893, "learning_rate": 1.2607003891050584e-05, "loss": 14.1027, "step": 81 }, { "epoch": 0.0063841175611892365, "grad_norm": 28.357349475675925, "learning_rate": 1.2762645914396887e-05, "loss": 13.6864, "step": 82 }, { "epoch": 0.006461972653398862, "grad_norm": 60.3678766214953, "learning_rate": 1.2918287937743192e-05, "loss": 13.6272, "step": 83 }, { "epoch": 0.006539827745608486, "grad_norm": 32.88921773408999, "learning_rate": 1.3073929961089496e-05, "loss": 13.2958, "step": 84 }, { "epoch": 0.006617682837818111, "grad_norm": 24.18605294612885, "learning_rate": 1.3229571984435798e-05, "loss": 13.3549, "step": 85 }, { "epoch": 0.006695537930027736, "grad_norm": 35.68821172449771, "learning_rate": 1.3385214007782102e-05, "loss": 13.5067, "step": 86 }, { "epoch": 0.00677339302223736, "grad_norm": 50.13356759750823, "learning_rate": 1.3540856031128407e-05, "loss": 13.3806, "step": 87 }, { "epoch": 0.006851248114446986, "grad_norm": 26.405635843369584, "learning_rate": 1.369649805447471e-05, "loss": 12.9621, "step": 88 }, { "epoch": 0.00692910320665661, "grad_norm": 34.08317734392491, "learning_rate": 1.3852140077821013e-05, "loss": 13.0268, "step": 89 }, { "epoch": 0.007006958298866235, "grad_norm": 22.072059459982867, "learning_rate": 1.4007782101167315e-05, "loss": 13.0179, "step": 90 }, { "epoch": 0.00708481339107586, "grad_norm": 29.579534839823822, "learning_rate": 1.4163424124513621e-05, "loss": 13.1395, "step": 91 }, { "epoch": 0.007162668483285485, "grad_norm": 34.7344958621372, "learning_rate": 1.4319066147859923e-05, "loss": 12.7221, "step": 92 }, { "epoch": 0.0072405235754951095, "grad_norm": 46.91483521203548, "learning_rate": 1.4474708171206227e-05, "loss": 12.769, "step": 93 }, { "epoch": 0.007318378667704735, "grad_norm": 35.51316838079691, "learning_rate": 1.4630350194552529e-05, "loss": 12.529, "step": 94 }, { "epoch": 0.007396233759914359, "grad_norm": 40.53529744898346, "learning_rate": 1.4785992217898835e-05, "loss": 12.7757, "step": 95 }, { "epoch": 0.0074740888521239845, "grad_norm": 36.717327009505325, "learning_rate": 1.4941634241245138e-05, "loss": 12.4107, "step": 96 }, { "epoch": 0.007551943944333609, "grad_norm": 42.66641442203132, "learning_rate": 1.509727626459144e-05, "loss": 12.317, "step": 97 }, { "epoch": 0.007629799036543234, "grad_norm": 28.950556781433754, "learning_rate": 1.5252918287937744e-05, "loss": 12.1217, "step": 98 }, { "epoch": 0.007707654128752859, "grad_norm": 23.88804214778523, "learning_rate": 1.540856031128405e-05, "loss": 12.0703, "step": 99 }, { "epoch": 0.007785509220962484, "grad_norm": 32.7631683603279, "learning_rate": 1.5564202334630352e-05, "loss": 11.9375, "step": 100 }, { "epoch": 0.007785509220962484, "eval_loss": 1.4913856983184814, "eval_runtime": 161.8838, "eval_samples_per_second": 17.791, "eval_steps_per_second": 0.636, "step": 100 }, { "epoch": 0.00786336431317211, "grad_norm": 45.790389131157724, "learning_rate": 1.5719844357976654e-05, "loss": 11.9587, "step": 101 }, { "epoch": 0.007941219405381733, "grad_norm": 26.398399313120933, "learning_rate": 1.5875486381322956e-05, "loss": 11.7567, "step": 102 }, { "epoch": 0.008019074497591358, "grad_norm": 17.320324597122053, "learning_rate": 1.6031128404669262e-05, "loss": 11.4833, "step": 103 }, { "epoch": 0.008096929589800983, "grad_norm": 41.133396671424876, "learning_rate": 1.6186770428015567e-05, "loss": 11.5067, "step": 104 }, { "epoch": 0.008174784682010609, "grad_norm": 37.41563077367548, "learning_rate": 1.634241245136187e-05, "loss": 11.5379, "step": 105 }, { "epoch": 0.008252639774220232, "grad_norm": 33.98821157419034, "learning_rate": 1.649805447470817e-05, "loss": 11.3806, "step": 106 }, { "epoch": 0.008330494866429857, "grad_norm": 25.65195123595081, "learning_rate": 1.6653696498054477e-05, "loss": 11.0379, "step": 107 }, { "epoch": 0.008408349958639483, "grad_norm": 32.83961245860061, "learning_rate": 1.680933852140078e-05, "loss": 10.875, "step": 108 }, { "epoch": 0.008486205050849106, "grad_norm": 34.80931273466421, "learning_rate": 1.696498054474708e-05, "loss": 10.8661, "step": 109 }, { "epoch": 0.008564060143058732, "grad_norm": 29.620541058570016, "learning_rate": 1.7120622568093387e-05, "loss": 10.7846, "step": 110 }, { "epoch": 0.008641915235268357, "grad_norm": 24.935340753482397, "learning_rate": 1.7276264591439692e-05, "loss": 10.6384, "step": 111 }, { "epoch": 0.008719770327477982, "grad_norm": 35.23511695724971, "learning_rate": 1.7431906614785994e-05, "loss": 10.6596, "step": 112 }, { "epoch": 0.008797625419687606, "grad_norm": 29.88600828063079, "learning_rate": 1.7587548638132297e-05, "loss": 10.7533, "step": 113 }, { "epoch": 0.008875480511897231, "grad_norm": 23.710410278172276, "learning_rate": 1.77431906614786e-05, "loss": 10.2388, "step": 114 }, { "epoch": 0.008953335604106856, "grad_norm": 37.482219463231985, "learning_rate": 1.7898832684824904e-05, "loss": 10.2031, "step": 115 }, { "epoch": 0.009031190696316482, "grad_norm": 30.38221216412258, "learning_rate": 1.8054474708171206e-05, "loss": 10.471, "step": 116 }, { "epoch": 0.009109045788526105, "grad_norm": 23.650355909580682, "learning_rate": 1.8210116731517512e-05, "loss": 10.1819, "step": 117 }, { "epoch": 0.00918690088073573, "grad_norm": 35.289776142973494, "learning_rate": 1.8365758754863814e-05, "loss": 9.9665, "step": 118 }, { "epoch": 0.009264755972945356, "grad_norm": 27.471808245646482, "learning_rate": 1.852140077821012e-05, "loss": 10.1395, "step": 119 }, { "epoch": 0.009342611065154981, "grad_norm": 20.810217536202178, "learning_rate": 1.867704280155642e-05, "loss": 9.7985, "step": 120 }, { "epoch": 0.009420466157364605, "grad_norm": 23.711747288475657, "learning_rate": 1.8832684824902724e-05, "loss": 9.5033, "step": 121 }, { "epoch": 0.00949832124957423, "grad_norm": 24.26199458770064, "learning_rate": 1.898832684824903e-05, "loss": 9.7595, "step": 122 }, { "epoch": 0.009576176341783855, "grad_norm": 38.24881764284132, "learning_rate": 1.914396887159533e-05, "loss": 9.8532, "step": 123 }, { "epoch": 0.00965403143399348, "grad_norm": 35.98519732223564, "learning_rate": 1.9299610894941637e-05, "loss": 9.8638, "step": 124 }, { "epoch": 0.009731886526203104, "grad_norm": 23.98799118109223, "learning_rate": 1.945525291828794e-05, "loss": 9.596, "step": 125 }, { "epoch": 0.00980974161841273, "grad_norm": 39.75933272675689, "learning_rate": 1.961089494163424e-05, "loss": 9.5876, "step": 126 }, { "epoch": 0.009887596710622355, "grad_norm": 32.99122426961252, "learning_rate": 1.9766536964980547e-05, "loss": 9.6629, "step": 127 }, { "epoch": 0.009965451802831978, "grad_norm": 20.94460299942558, "learning_rate": 1.992217898832685e-05, "loss": 9.2539, "step": 128 }, { "epoch": 0.010043306895041603, "grad_norm": 29.617796770929594, "learning_rate": 2.0077821011673154e-05, "loss": 9.4247, "step": 129 }, { "epoch": 0.010121161987251229, "grad_norm": 38.126649525975374, "learning_rate": 2.0233463035019457e-05, "loss": 9.3382, "step": 130 }, { "epoch": 0.010199017079460854, "grad_norm": 29.811410998797797, "learning_rate": 2.038910505836576e-05, "loss": 9.3627, "step": 131 }, { "epoch": 0.010276872171670478, "grad_norm": 19.430608958627378, "learning_rate": 2.054474708171206e-05, "loss": 8.9598, "step": 132 }, { "epoch": 0.010354727263880103, "grad_norm": 33.950636533659655, "learning_rate": 2.070038910505837e-05, "loss": 9.24, "step": 133 }, { "epoch": 0.010432582356089728, "grad_norm": 29.303727903430108, "learning_rate": 2.0856031128404672e-05, "loss": 9.1663, "step": 134 }, { "epoch": 0.010510437448299353, "grad_norm": 21.411156212646112, "learning_rate": 2.1011673151750974e-05, "loss": 9.0363, "step": 135 }, { "epoch": 0.010588292540508977, "grad_norm": 20.6123276018682, "learning_rate": 2.116731517509728e-05, "loss": 9.0033, "step": 136 }, { "epoch": 0.010666147632718602, "grad_norm": 26.616584936339162, "learning_rate": 2.132295719844358e-05, "loss": 8.8878, "step": 137 }, { "epoch": 0.010744002724928228, "grad_norm": 25.55864765245493, "learning_rate": 2.1478599221789884e-05, "loss": 8.7109, "step": 138 }, { "epoch": 0.010821857817137853, "grad_norm": 19.09185015144203, "learning_rate": 2.1634241245136186e-05, "loss": 8.8231, "step": 139 }, { "epoch": 0.010899712909347476, "grad_norm": 26.445437306776675, "learning_rate": 2.178988326848249e-05, "loss": 8.7835, "step": 140 }, { "epoch": 0.010977568001557102, "grad_norm": 20.79227558080322, "learning_rate": 2.1945525291828797e-05, "loss": 8.8566, "step": 141 }, { "epoch": 0.011055423093766727, "grad_norm": 28.72561958586002, "learning_rate": 2.21011673151751e-05, "loss": 8.8962, "step": 142 }, { "epoch": 0.011133278185976352, "grad_norm": 25.210461307729858, "learning_rate": 2.2256809338521405e-05, "loss": 8.6334, "step": 143 }, { "epoch": 0.011211133278185976, "grad_norm": 24.023675913192026, "learning_rate": 2.2412451361867707e-05, "loss": 8.7517, "step": 144 }, { "epoch": 0.011288988370395601, "grad_norm": 21.561466115974483, "learning_rate": 2.256809338521401e-05, "loss": 8.4515, "step": 145 }, { "epoch": 0.011366843462605226, "grad_norm": 26.38565620888859, "learning_rate": 2.272373540856031e-05, "loss": 8.4877, "step": 146 }, { "epoch": 0.011444698554814852, "grad_norm": 19.74036515049233, "learning_rate": 2.2879377431906616e-05, "loss": 8.5625, "step": 147 }, { "epoch": 0.011522553647024475, "grad_norm": 31.347026500016796, "learning_rate": 2.303501945525292e-05, "loss": 8.4297, "step": 148 }, { "epoch": 0.0116004087392341, "grad_norm": 20.731179068400277, "learning_rate": 2.3190661478599224e-05, "loss": 8.3114, "step": 149 }, { "epoch": 0.011678263831443726, "grad_norm": 17.859131391402908, "learning_rate": 2.334630350194553e-05, "loss": 8.3259, "step": 150 }, { "epoch": 0.011678263831443726, "eval_loss": 1.0506458282470703, "eval_runtime": 162.1552, "eval_samples_per_second": 17.761, "eval_steps_per_second": 0.635, "step": 150 }, { "epoch": 0.01175611892365335, "grad_norm": 27.30566626043536, "learning_rate": 2.3501945525291832e-05, "loss": 8.5329, "step": 151 }, { "epoch": 0.011833974015862975, "grad_norm": 18.034759648997614, "learning_rate": 2.3657587548638134e-05, "loss": 8.2042, "step": 152 }, { "epoch": 0.0119118291080726, "grad_norm": 36.28035897691369, "learning_rate": 2.3813229571984436e-05, "loss": 8.2824, "step": 153 }, { "epoch": 0.011989684200282225, "grad_norm": 19.28794781372954, "learning_rate": 2.396887159533074e-05, "loss": 8.2316, "step": 154 }, { "epoch": 0.012067539292491849, "grad_norm": 25.741063524997717, "learning_rate": 2.4124513618677044e-05, "loss": 8.3114, "step": 155 }, { "epoch": 0.012145394384701474, "grad_norm": 23.01481969657276, "learning_rate": 2.4280155642023346e-05, "loss": 8.2645, "step": 156 }, { "epoch": 0.0122232494769111, "grad_norm": 33.174508157523256, "learning_rate": 2.4435797665369655e-05, "loss": 8.1624, "step": 157 }, { "epoch": 0.012301104569120725, "grad_norm": 21.70284436469417, "learning_rate": 2.4591439688715957e-05, "loss": 8.0218, "step": 158 }, { "epoch": 0.012378959661330348, "grad_norm": 30.60193681662741, "learning_rate": 2.474708171206226e-05, "loss": 7.9353, "step": 159 }, { "epoch": 0.012456814753539974, "grad_norm": 26.157210770006458, "learning_rate": 2.490272373540856e-05, "loss": 7.9849, "step": 160 }, { "epoch": 0.012534669845749599, "grad_norm": 21.659388140944888, "learning_rate": 2.5058365758754867e-05, "loss": 7.8722, "step": 161 }, { "epoch": 0.012612524937959224, "grad_norm": 21.265044731924903, "learning_rate": 2.521400778210117e-05, "loss": 7.5536, "step": 162 }, { "epoch": 0.012690380030168848, "grad_norm": 17.42563226305151, "learning_rate": 2.536964980544747e-05, "loss": 7.8968, "step": 163 }, { "epoch": 0.012768235122378473, "grad_norm": 17.834155830885642, "learning_rate": 2.5525291828793773e-05, "loss": 7.365, "step": 164 }, { "epoch": 0.012846090214588098, "grad_norm": 26.07262850142949, "learning_rate": 2.5680933852140082e-05, "loss": 7.5301, "step": 165 }, { "epoch": 0.012923945306797724, "grad_norm": 27.016058943873357, "learning_rate": 2.5836575875486384e-05, "loss": 7.5854, "step": 166 }, { "epoch": 0.013001800399007347, "grad_norm": 16.218866207204805, "learning_rate": 2.5992217898832686e-05, "loss": 7.505, "step": 167 }, { "epoch": 0.013079655491216972, "grad_norm": 19.474658460215057, "learning_rate": 2.614785992217899e-05, "loss": 7.5848, "step": 168 }, { "epoch": 0.013157510583426598, "grad_norm": 21.631272967843703, "learning_rate": 2.6303501945525294e-05, "loss": 7.2405, "step": 169 }, { "epoch": 0.013235365675636221, "grad_norm": 23.418590553747517, "learning_rate": 2.6459143968871596e-05, "loss": 7.3845, "step": 170 }, { "epoch": 0.013313220767845847, "grad_norm": 17.899087918898573, "learning_rate": 2.6614785992217898e-05, "loss": 7.159, "step": 171 }, { "epoch": 0.013391075860055472, "grad_norm": 17.199162540248903, "learning_rate": 2.6770428015564204e-05, "loss": 7.0664, "step": 172 }, { "epoch": 0.013468930952265097, "grad_norm": 23.198882353048255, "learning_rate": 2.692607003891051e-05, "loss": 7.1987, "step": 173 }, { "epoch": 0.01354678604447472, "grad_norm": 18.98570927796689, "learning_rate": 2.7081712062256815e-05, "loss": 7.0781, "step": 174 }, { "epoch": 0.013624641136684346, "grad_norm": 21.825629684014647, "learning_rate": 2.7237354085603117e-05, "loss": 7.0675, "step": 175 }, { "epoch": 0.013702496228893971, "grad_norm": 18.222797044508923, "learning_rate": 2.739299610894942e-05, "loss": 6.9872, "step": 176 }, { "epoch": 0.013780351321103597, "grad_norm": 24.597397205191996, "learning_rate": 2.754863813229572e-05, "loss": 6.9492, "step": 177 }, { "epoch": 0.01385820641331322, "grad_norm": 19.81385818927919, "learning_rate": 2.7704280155642027e-05, "loss": 6.8331, "step": 178 }, { "epoch": 0.013936061505522845, "grad_norm": 35.063878205798666, "learning_rate": 2.785992217898833e-05, "loss": 6.8744, "step": 179 }, { "epoch": 0.01401391659773247, "grad_norm": 15.741940602720065, "learning_rate": 2.801556420233463e-05, "loss": 6.827, "step": 180 }, { "epoch": 0.014091771689942096, "grad_norm": 30.462437620401584, "learning_rate": 2.817120622568094e-05, "loss": 6.8404, "step": 181 }, { "epoch": 0.01416962678215172, "grad_norm": 22.084133371649045, "learning_rate": 2.8326848249027242e-05, "loss": 7.2042, "step": 182 }, { "epoch": 0.014247481874361345, "grad_norm": 23.095610019264463, "learning_rate": 2.8482490272373544e-05, "loss": 6.7879, "step": 183 }, { "epoch": 0.01432533696657097, "grad_norm": 24.909439349024844, "learning_rate": 2.8638132295719846e-05, "loss": 6.736, "step": 184 }, { "epoch": 0.014403192058780595, "grad_norm": 20.57030766705449, "learning_rate": 2.879377431906615e-05, "loss": 6.8259, "step": 185 }, { "epoch": 0.014481047150990219, "grad_norm": 17.57077189686258, "learning_rate": 2.8949416342412454e-05, "loss": 6.601, "step": 186 }, { "epoch": 0.014558902243199844, "grad_norm": 14.863760152248929, "learning_rate": 2.9105058365758756e-05, "loss": 6.2405, "step": 187 }, { "epoch": 0.01463675733540947, "grad_norm": 22.46661172144308, "learning_rate": 2.9260700389105058e-05, "loss": 6.6512, "step": 188 }, { "epoch": 0.014714612427619095, "grad_norm": 17.728970728605066, "learning_rate": 2.9416342412451367e-05, "loss": 6.2952, "step": 189 }, { "epoch": 0.014792467519828718, "grad_norm": 15.359017051877998, "learning_rate": 2.957198443579767e-05, "loss": 6.264, "step": 190 }, { "epoch": 0.014870322612038344, "grad_norm": 16.808413144429387, "learning_rate": 2.972762645914397e-05, "loss": 6.4129, "step": 191 }, { "epoch": 0.014948177704247969, "grad_norm": 19.6817846817645, "learning_rate": 2.9883268482490277e-05, "loss": 6.3175, "step": 192 }, { "epoch": 0.015026032796457593, "grad_norm": 21.647762553954767, "learning_rate": 3.003891050583658e-05, "loss": 6.3482, "step": 193 }, { "epoch": 0.015103887888667218, "grad_norm": 17.084568772285454, "learning_rate": 3.019455252918288e-05, "loss": 6.2427, "step": 194 }, { "epoch": 0.015181742980876843, "grad_norm": 14.941991590435205, "learning_rate": 3.0350194552529183e-05, "loss": 6.2902, "step": 195 }, { "epoch": 0.015259598073086468, "grad_norm": 15.491068962663801, "learning_rate": 3.050583657587549e-05, "loss": 6.3415, "step": 196 }, { "epoch": 0.015337453165296092, "grad_norm": 15.351476645070544, "learning_rate": 3.0661478599221794e-05, "loss": 6.0184, "step": 197 }, { "epoch": 0.015415308257505717, "grad_norm": 17.518130426522664, "learning_rate": 3.08171206225681e-05, "loss": 6.2455, "step": 198 }, { "epoch": 0.015493163349715343, "grad_norm": 15.275038178375764, "learning_rate": 3.09727626459144e-05, "loss": 6.067, "step": 199 }, { "epoch": 0.015571018441924968, "grad_norm": 18.050184290280257, "learning_rate": 3.1128404669260704e-05, "loss": 6.0921, "step": 200 }, { "epoch": 0.015571018441924968, "eval_loss": 0.7552824020385742, "eval_runtime": 162.1093, "eval_samples_per_second": 17.766, "eval_steps_per_second": 0.635, "step": 200 }, { "epoch": 0.01564887353413459, "grad_norm": 15.467542769342975, "learning_rate": 3.128404669260701e-05, "loss": 6.1641, "step": 201 }, { "epoch": 0.01572672862634422, "grad_norm": 14.740882884147089, "learning_rate": 3.143968871595331e-05, "loss": 5.8636, "step": 202 }, { "epoch": 0.015804583718553842, "grad_norm": 15.216941366042365, "learning_rate": 3.1595330739299614e-05, "loss": 5.8518, "step": 203 }, { "epoch": 0.015882438810763466, "grad_norm": 12.620562619167517, "learning_rate": 3.175097276264591e-05, "loss": 5.9958, "step": 204 }, { "epoch": 0.015960293902973093, "grad_norm": 17.803168572178702, "learning_rate": 3.1906614785992225e-05, "loss": 5.8309, "step": 205 }, { "epoch": 0.016038148995182716, "grad_norm": 14.920616538451824, "learning_rate": 3.2062256809338523e-05, "loss": 5.6738, "step": 206 }, { "epoch": 0.01611600408739234, "grad_norm": 18.086291707791727, "learning_rate": 3.221789883268483e-05, "loss": 5.8599, "step": 207 }, { "epoch": 0.016193859179601967, "grad_norm": 16.875745931962605, "learning_rate": 3.2373540856031135e-05, "loss": 5.9191, "step": 208 }, { "epoch": 0.01627171427181159, "grad_norm": 16.213498672519588, "learning_rate": 3.252918287937743e-05, "loss": 5.7296, "step": 209 }, { "epoch": 0.016349569364021217, "grad_norm": 19.27539505030557, "learning_rate": 3.268482490272374e-05, "loss": 5.8691, "step": 210 }, { "epoch": 0.01642742445623084, "grad_norm": 16.019904547501085, "learning_rate": 3.284046692607004e-05, "loss": 5.4367, "step": 211 }, { "epoch": 0.016505279548440464, "grad_norm": 13.757874617650941, "learning_rate": 3.299610894941634e-05, "loss": 5.75, "step": 212 }, { "epoch": 0.01658313464065009, "grad_norm": 17.992254423342725, "learning_rate": 3.315175097276265e-05, "loss": 5.8281, "step": 213 }, { "epoch": 0.016660989732859715, "grad_norm": 16.432268652180927, "learning_rate": 3.3307392996108954e-05, "loss": 5.6551, "step": 214 }, { "epoch": 0.01673884482506934, "grad_norm": 12.928424375741908, "learning_rate": 3.346303501945526e-05, "loss": 5.6152, "step": 215 }, { "epoch": 0.016816699917278966, "grad_norm": 12.408493436400482, "learning_rate": 3.361867704280156e-05, "loss": 5.5273, "step": 216 }, { "epoch": 0.01689455500948859, "grad_norm": 13.268209803076985, "learning_rate": 3.3774319066147864e-05, "loss": 5.4233, "step": 217 }, { "epoch": 0.016972410101698213, "grad_norm": 14.454869684993751, "learning_rate": 3.392996108949416e-05, "loss": 5.6261, "step": 218 }, { "epoch": 0.01705026519390784, "grad_norm": 13.67324320718479, "learning_rate": 3.408560311284047e-05, "loss": 5.423, "step": 219 }, { "epoch": 0.017128120286117463, "grad_norm": 17.356825047270373, "learning_rate": 3.4241245136186774e-05, "loss": 5.5692, "step": 220 }, { "epoch": 0.01720597537832709, "grad_norm": 11.821964293676844, "learning_rate": 3.439688715953308e-05, "loss": 5.3359, "step": 221 }, { "epoch": 0.017283830470536714, "grad_norm": 14.090207695302004, "learning_rate": 3.4552529182879385e-05, "loss": 5.3449, "step": 222 }, { "epoch": 0.017361685562746337, "grad_norm": 14.031415832762907, "learning_rate": 3.4708171206225683e-05, "loss": 5.6696, "step": 223 }, { "epoch": 0.017439540654955964, "grad_norm": 17.678732118370306, "learning_rate": 3.486381322957199e-05, "loss": 5.6208, "step": 224 }, { "epoch": 0.017517395747165588, "grad_norm": 12.531938923423303, "learning_rate": 3.501945525291829e-05, "loss": 5.418, "step": 225 }, { "epoch": 0.01759525083937521, "grad_norm": 18.047445664195095, "learning_rate": 3.517509727626459e-05, "loss": 5.365, "step": 226 }, { "epoch": 0.01767310593158484, "grad_norm": 12.079642607514598, "learning_rate": 3.53307392996109e-05, "loss": 5.2249, "step": 227 }, { "epoch": 0.017750961023794462, "grad_norm": 14.89914814095269, "learning_rate": 3.54863813229572e-05, "loss": 5.3432, "step": 228 }, { "epoch": 0.01782881611600409, "grad_norm": 13.108829681630151, "learning_rate": 3.564202334630351e-05, "loss": 5.2737, "step": 229 }, { "epoch": 0.017906671208213713, "grad_norm": 10.99124610294337, "learning_rate": 3.579766536964981e-05, "loss": 5.2609, "step": 230 }, { "epoch": 0.017984526300423336, "grad_norm": 13.362213185951811, "learning_rate": 3.5953307392996114e-05, "loss": 5.1705, "step": 231 }, { "epoch": 0.018062381392632963, "grad_norm": 11.873247365487005, "learning_rate": 3.610894941634241e-05, "loss": 5.1501, "step": 232 }, { "epoch": 0.018140236484842587, "grad_norm": 16.64578136436058, "learning_rate": 3.626459143968872e-05, "loss": 5.2012, "step": 233 }, { "epoch": 0.01821809157705221, "grad_norm": 12.327384165162691, "learning_rate": 3.6420233463035024e-05, "loss": 5.1666, "step": 234 }, { "epoch": 0.018295946669261837, "grad_norm": 10.421075057367464, "learning_rate": 3.657587548638132e-05, "loss": 5.1007, "step": 235 }, { "epoch": 0.01837380176147146, "grad_norm": 11.74265471914553, "learning_rate": 3.673151750972763e-05, "loss": 5.0332, "step": 236 }, { "epoch": 0.018451656853681084, "grad_norm": 12.541553503315761, "learning_rate": 3.6887159533073934e-05, "loss": 4.9646, "step": 237 }, { "epoch": 0.01852951194589071, "grad_norm": 13.607050280792892, "learning_rate": 3.704280155642024e-05, "loss": 5.0971, "step": 238 }, { "epoch": 0.018607367038100335, "grad_norm": 18.825662045238023, "learning_rate": 3.719844357976654e-05, "loss": 5.1018, "step": 239 }, { "epoch": 0.018685222130309962, "grad_norm": 10.720423311685373, "learning_rate": 3.735408560311284e-05, "loss": 5.0988, "step": 240 }, { "epoch": 0.018763077222519586, "grad_norm": 14.723609327663151, "learning_rate": 3.750972762645915e-05, "loss": 4.9732, "step": 241 }, { "epoch": 0.01884093231472921, "grad_norm": 10.944740628423082, "learning_rate": 3.766536964980545e-05, "loss": 4.7478, "step": 242 }, { "epoch": 0.018918787406938836, "grad_norm": 12.230051627356593, "learning_rate": 3.782101167315175e-05, "loss": 5.0131, "step": 243 }, { "epoch": 0.01899664249914846, "grad_norm": 10.967792186116919, "learning_rate": 3.797665369649806e-05, "loss": 4.9902, "step": 244 }, { "epoch": 0.019074497591358083, "grad_norm": 9.907708366606155, "learning_rate": 3.8132295719844364e-05, "loss": 4.6819, "step": 245 }, { "epoch": 0.01915235268356771, "grad_norm": 10.494313960979623, "learning_rate": 3.828793774319066e-05, "loss": 5.1842, "step": 246 }, { "epoch": 0.019230207775777334, "grad_norm": 12.127910866804823, "learning_rate": 3.844357976653697e-05, "loss": 5.0268, "step": 247 }, { "epoch": 0.01930806286798696, "grad_norm": 11.549618091496555, "learning_rate": 3.8599221789883274e-05, "loss": 4.8585, "step": 248 }, { "epoch": 0.019385917960196584, "grad_norm": 12.6429644155397, "learning_rate": 3.875486381322957e-05, "loss": 4.9406, "step": 249 }, { "epoch": 0.019463773052406208, "grad_norm": 11.747726687286086, "learning_rate": 3.891050583657588e-05, "loss": 4.7176, "step": 250 }, { "epoch": 0.019463773052406208, "eval_loss": 0.6106075644493103, "eval_runtime": 162.2206, "eval_samples_per_second": 17.754, "eval_steps_per_second": 0.635, "step": 250 }, { "epoch": 0.019541628144615835, "grad_norm": 10.479890602451624, "learning_rate": 3.9066147859922184e-05, "loss": 4.6811, "step": 251 }, { "epoch": 0.01961948323682546, "grad_norm": 13.102907371903006, "learning_rate": 3.922178988326848e-05, "loss": 4.7254, "step": 252 }, { "epoch": 0.019697338329035082, "grad_norm": 14.18881843596508, "learning_rate": 3.9377431906614795e-05, "loss": 4.781, "step": 253 }, { "epoch": 0.01977519342124471, "grad_norm": 11.858714245158215, "learning_rate": 3.9533073929961093e-05, "loss": 4.8267, "step": 254 }, { "epoch": 0.019853048513454333, "grad_norm": 11.246949862182145, "learning_rate": 3.96887159533074e-05, "loss": 4.8571, "step": 255 }, { "epoch": 0.019930903605663956, "grad_norm": 12.26973098692896, "learning_rate": 3.98443579766537e-05, "loss": 4.7109, "step": 256 }, { "epoch": 0.020008758697873583, "grad_norm": 11.966310331575647, "learning_rate": 4e-05, "loss": 4.6526, "step": 257 }, { "epoch": 0.020086613790083207, "grad_norm": 10.483447001940847, "learning_rate": 3.9999999377047007e-05, "loss": 4.7162, "step": 258 }, { "epoch": 0.020164468882292834, "grad_norm": 11.899484059039171, "learning_rate": 3.999999750818806e-05, "loss": 4.8013, "step": 259 }, { "epoch": 0.020242323974502457, "grad_norm": 11.839300656408453, "learning_rate": 3.999999439342327e-05, "loss": 4.7729, "step": 260 }, { "epoch": 0.02032017906671208, "grad_norm": 9.327719100654104, "learning_rate": 3.999999003275285e-05, "loss": 4.7383, "step": 261 }, { "epoch": 0.020398034158921708, "grad_norm": 11.739109632589912, "learning_rate": 3.9999984426177044e-05, "loss": 4.7924, "step": 262 }, { "epoch": 0.02047588925113133, "grad_norm": 11.27118226226202, "learning_rate": 3.999997757369623e-05, "loss": 4.6032, "step": 263 }, { "epoch": 0.020553744343340955, "grad_norm": 10.491116235758065, "learning_rate": 3.9999969475310816e-05, "loss": 4.5285, "step": 264 }, { "epoch": 0.020631599435550582, "grad_norm": 10.061467832416175, "learning_rate": 3.9999960131021306e-05, "loss": 4.6169, "step": 265 }, { "epoch": 0.020709454527760206, "grad_norm": 10.038002387744271, "learning_rate": 3.99999495408283e-05, "loss": 4.445, "step": 266 }, { "epoch": 0.020787309619969833, "grad_norm": 10.236322251021582, "learning_rate": 3.999993770473244e-05, "loss": 4.3305, "step": 267 }, { "epoch": 0.020865164712179456, "grad_norm": 10.117031593277567, "learning_rate": 3.9999924622734464e-05, "loss": 4.4824, "step": 268 }, { "epoch": 0.02094301980438908, "grad_norm": 10.663515149589823, "learning_rate": 3.99999102948352e-05, "loss": 4.2595, "step": 269 }, { "epoch": 0.021020874896598707, "grad_norm": 11.026544609507976, "learning_rate": 3.999989472103552e-05, "loss": 4.4607, "step": 270 }, { "epoch": 0.02109872998880833, "grad_norm": 12.707161300781825, "learning_rate": 3.999987790133641e-05, "loss": 4.4548, "step": 271 }, { "epoch": 0.021176585081017954, "grad_norm": 12.716457295991836, "learning_rate": 3.9999859835738926e-05, "loss": 4.3901, "step": 272 }, { "epoch": 0.02125444017322758, "grad_norm": 9.97497662223691, "learning_rate": 3.9999840524244174e-05, "loss": 4.4389, "step": 273 }, { "epoch": 0.021332295265437205, "grad_norm": 10.390538125386763, "learning_rate": 3.999981996685337e-05, "loss": 4.4046, "step": 274 }, { "epoch": 0.021410150357646828, "grad_norm": 9.579164937138087, "learning_rate": 3.9999798163567785e-05, "loss": 4.5393, "step": 275 }, { "epoch": 0.021488005449856455, "grad_norm": 10.400188252112907, "learning_rate": 3.999977511438878e-05, "loss": 4.546, "step": 276 }, { "epoch": 0.02156586054206608, "grad_norm": 9.571219654979428, "learning_rate": 3.99997508193178e-05, "loss": 4.4685, "step": 277 }, { "epoch": 0.021643715634275706, "grad_norm": 10.698691856315946, "learning_rate": 3.999972527835634e-05, "loss": 4.363, "step": 278 }, { "epoch": 0.02172157072648533, "grad_norm": 10.438854847698764, "learning_rate": 3.999969849150601e-05, "loss": 4.4174, "step": 279 }, { "epoch": 0.021799425818694953, "grad_norm": 9.272334054133177, "learning_rate": 3.9999670458768474e-05, "loss": 4.4542, "step": 280 }, { "epoch": 0.02187728091090458, "grad_norm": 9.941326893838612, "learning_rate": 3.999964118014547e-05, "loss": 4.1836, "step": 281 }, { "epoch": 0.021955136003114203, "grad_norm": 11.090049990832467, "learning_rate": 3.999961065563882e-05, "loss": 4.2697, "step": 282 }, { "epoch": 0.022032991095323827, "grad_norm": 10.928166781341407, "learning_rate": 3.9999578885250446e-05, "loss": 4.3446, "step": 283 }, { "epoch": 0.022110846187533454, "grad_norm": 12.475689756175333, "learning_rate": 3.999954586898231e-05, "loss": 4.2157, "step": 284 }, { "epoch": 0.022188701279743078, "grad_norm": 9.267402539119972, "learning_rate": 3.9999511606836466e-05, "loss": 4.0268, "step": 285 }, { "epoch": 0.022266556371952705, "grad_norm": 8.710760704583763, "learning_rate": 3.999947609881506e-05, "loss": 4.1613, "step": 286 }, { "epoch": 0.022344411464162328, "grad_norm": 11.597155281608371, "learning_rate": 3.99994393449203e-05, "loss": 4.2514, "step": 287 }, { "epoch": 0.02242226655637195, "grad_norm": 8.905052962627568, "learning_rate": 3.999940134515447e-05, "loss": 4.0259, "step": 288 }, { "epoch": 0.02250012164858158, "grad_norm": 8.59327507626874, "learning_rate": 3.9999362099519945e-05, "loss": 4.0667, "step": 289 }, { "epoch": 0.022577976740791202, "grad_norm": 8.029065533104099, "learning_rate": 3.999932160801917e-05, "loss": 4.1956, "step": 290 }, { "epoch": 0.022655831833000826, "grad_norm": 11.603465825224704, "learning_rate": 3.9999279870654654e-05, "loss": 4.3145, "step": 291 }, { "epoch": 0.022733686925210453, "grad_norm": 8.208276159557446, "learning_rate": 3.999923688742902e-05, "loss": 4.3309, "step": 292 }, { "epoch": 0.022811542017420076, "grad_norm": 10.124772751437916, "learning_rate": 3.9999192658344926e-05, "loss": 3.9852, "step": 293 }, { "epoch": 0.022889397109629703, "grad_norm": 9.920239587743648, "learning_rate": 3.9999147183405134e-05, "loss": 4.2503, "step": 294 }, { "epoch": 0.022967252201839327, "grad_norm": 12.04369676419899, "learning_rate": 3.999910046261247e-05, "loss": 4.192, "step": 295 }, { "epoch": 0.02304510729404895, "grad_norm": 9.065424700072072, "learning_rate": 3.999905249596986e-05, "loss": 4.1381, "step": 296 }, { "epoch": 0.023122962386258578, "grad_norm": 8.661799049807106, "learning_rate": 3.999900328348028e-05, "loss": 4.1747, "step": 297 }, { "epoch": 0.0232008174784682, "grad_norm": 9.728404812185602, "learning_rate": 3.99989528251468e-05, "loss": 3.9774, "step": 298 }, { "epoch": 0.023278672570677825, "grad_norm": 8.446763487080284, "learning_rate": 3.999890112097256e-05, "loss": 4.2193, "step": 299 }, { "epoch": 0.02335652766288745, "grad_norm": 10.738330479880522, "learning_rate": 3.999884817096079e-05, "loss": 4.2291, "step": 300 }, { "epoch": 0.02335652766288745, "eval_loss": 0.5308761596679688, "eval_runtime": 162.1846, "eval_samples_per_second": 17.758, "eval_steps_per_second": 0.635, "step": 300 }, { "epoch": 0.023434382755097075, "grad_norm": 9.368526029819618, "learning_rate": 3.999879397511477e-05, "loss": 4.0977, "step": 301 }, { "epoch": 0.0235122378473067, "grad_norm": 11.37398574323803, "learning_rate": 3.9998738533437895e-05, "loss": 4.1406, "step": 302 }, { "epoch": 0.023590092939516326, "grad_norm": 14.975891054355294, "learning_rate": 3.9998681845933615e-05, "loss": 4.3025, "step": 303 }, { "epoch": 0.02366794803172595, "grad_norm": 19.904391038418424, "learning_rate": 3.999862391260546e-05, "loss": 4.2994, "step": 304 }, { "epoch": 0.023745803123935576, "grad_norm": 79.6389388832343, "learning_rate": 3.9998564733457025e-05, "loss": 6.0413, "step": 305 }, { "epoch": 0.0238236582161452, "grad_norm": 18.52117681967074, "learning_rate": 3.9998504308492026e-05, "loss": 4.796, "step": 306 }, { "epoch": 0.023901513308354824, "grad_norm": 8.965914742319898, "learning_rate": 3.9998442637714194e-05, "loss": 4.2723, "step": 307 }, { "epoch": 0.02397936840056445, "grad_norm": 10.433866811700746, "learning_rate": 3.999837972112739e-05, "loss": 4.3446, "step": 308 }, { "epoch": 0.024057223492774074, "grad_norm": 9.624083171209858, "learning_rate": 3.999831555873553e-05, "loss": 4.0075, "step": 309 }, { "epoch": 0.024135078584983698, "grad_norm": 8.836485655373322, "learning_rate": 3.999825015054262e-05, "loss": 4.192, "step": 310 }, { "epoch": 0.024212933677193325, "grad_norm": 9.389906511792384, "learning_rate": 3.9998183496552716e-05, "loss": 4.1228, "step": 311 }, { "epoch": 0.024290788769402948, "grad_norm": 10.534023122794743, "learning_rate": 3.999811559676999e-05, "loss": 4.1141, "step": 312 }, { "epoch": 0.024368643861612575, "grad_norm": 10.713105564288929, "learning_rate": 3.9998046451198654e-05, "loss": 4.1311, "step": 313 }, { "epoch": 0.0244464989538222, "grad_norm": 7.441538535623895, "learning_rate": 3.999797605984302e-05, "loss": 3.9743, "step": 314 }, { "epoch": 0.024524354046031822, "grad_norm": 10.408667762755407, "learning_rate": 3.9997904422707486e-05, "loss": 4.0765, "step": 315 }, { "epoch": 0.02460220913824145, "grad_norm": 10.611386024592742, "learning_rate": 3.9997831539796496e-05, "loss": 4.0, "step": 316 }, { "epoch": 0.024680064230451073, "grad_norm": 8.173579249212773, "learning_rate": 3.9997757411114604e-05, "loss": 3.9623, "step": 317 }, { "epoch": 0.024757919322660697, "grad_norm": 10.872645969823505, "learning_rate": 3.999768203666642e-05, "loss": 3.933, "step": 318 }, { "epoch": 0.024835774414870324, "grad_norm": 7.490020807210309, "learning_rate": 3.999760541645664e-05, "loss": 3.8097, "step": 319 }, { "epoch": 0.024913629507079947, "grad_norm": 8.028694066796438, "learning_rate": 3.9997527550490045e-05, "loss": 3.805, "step": 320 }, { "epoch": 0.02499148459928957, "grad_norm": 8.030952065143936, "learning_rate": 3.999744843877148e-05, "loss": 3.8689, "step": 321 }, { "epoch": 0.025069339691499198, "grad_norm": 7.743314950279922, "learning_rate": 3.999736808130587e-05, "loss": 4.0204, "step": 322 }, { "epoch": 0.02514719478370882, "grad_norm": 7.792301084556439, "learning_rate": 3.999728647809823e-05, "loss": 3.9333, "step": 323 }, { "epoch": 0.02522504987591845, "grad_norm": 9.895899962459652, "learning_rate": 3.9997203629153636e-05, "loss": 3.8577, "step": 324 }, { "epoch": 0.025302904968128072, "grad_norm": 8.40044383545745, "learning_rate": 3.999711953447725e-05, "loss": 3.9598, "step": 325 }, { "epoch": 0.025380760060337695, "grad_norm": 6.9073985423678455, "learning_rate": 3.999703419407431e-05, "loss": 3.8703, "step": 326 }, { "epoch": 0.025458615152547322, "grad_norm": 10.345028985923795, "learning_rate": 3.999694760795014e-05, "loss": 3.8555, "step": 327 }, { "epoch": 0.025536470244756946, "grad_norm": 9.87411019617178, "learning_rate": 3.999685977611012e-05, "loss": 3.9545, "step": 328 }, { "epoch": 0.02561432533696657, "grad_norm": 6.926220717049819, "learning_rate": 3.999677069855973e-05, "loss": 3.9453, "step": 329 }, { "epoch": 0.025692180429176197, "grad_norm": 10.002044784473442, "learning_rate": 3.9996680375304525e-05, "loss": 3.8135, "step": 330 }, { "epoch": 0.02577003552138582, "grad_norm": 7.708764547330901, "learning_rate": 3.999658880635013e-05, "loss": 3.8557, "step": 331 }, { "epoch": 0.025847890613595447, "grad_norm": 8.03021486063973, "learning_rate": 3.9996495991702226e-05, "loss": 3.8742, "step": 332 }, { "epoch": 0.02592574570580507, "grad_norm": 9.42981813328865, "learning_rate": 3.999640193136663e-05, "loss": 3.7506, "step": 333 }, { "epoch": 0.026003600798014694, "grad_norm": 7.845712042425165, "learning_rate": 3.999630662534918e-05, "loss": 3.7913, "step": 334 }, { "epoch": 0.02608145589022432, "grad_norm": 11.282945550222426, "learning_rate": 3.999621007365582e-05, "loss": 3.8075, "step": 335 }, { "epoch": 0.026159310982433945, "grad_norm": 9.168531455681366, "learning_rate": 3.999611227629255e-05, "loss": 3.7813, "step": 336 }, { "epoch": 0.02623716607464357, "grad_norm": 8.439150103566899, "learning_rate": 3.999601323326549e-05, "loss": 3.7068, "step": 337 }, { "epoch": 0.026315021166853195, "grad_norm": 9.010142243969984, "learning_rate": 3.999591294458079e-05, "loss": 3.7383, "step": 338 }, { "epoch": 0.02639287625906282, "grad_norm": 7.826491289217161, "learning_rate": 3.9995811410244705e-05, "loss": 3.7037, "step": 339 }, { "epoch": 0.026470731351272443, "grad_norm": 6.6671045390372194, "learning_rate": 3.999570863026356e-05, "loss": 3.6292, "step": 340 }, { "epoch": 0.02654858644348207, "grad_norm": 7.835189629632363, "learning_rate": 3.9995604604643754e-05, "loss": 3.7715, "step": 341 }, { "epoch": 0.026626441535691693, "grad_norm": 6.919785210442813, "learning_rate": 3.999549933339176e-05, "loss": 3.7451, "step": 342 }, { "epoch": 0.02670429662790132, "grad_norm": 6.849358469600196, "learning_rate": 3.999539281651415e-05, "loss": 3.5725, "step": 343 }, { "epoch": 0.026782151720110944, "grad_norm": 7.213572316142895, "learning_rate": 3.999528505401756e-05, "loss": 3.7321, "step": 344 }, { "epoch": 0.026860006812320567, "grad_norm": 10.498787012304499, "learning_rate": 3.999517604590869e-05, "loss": 3.8242, "step": 345 }, { "epoch": 0.026937861904530194, "grad_norm": 9.070192776062102, "learning_rate": 3.9995065792194344e-05, "loss": 3.5879, "step": 346 }, { "epoch": 0.027015716996739818, "grad_norm": 6.841271871593934, "learning_rate": 3.999495429288138e-05, "loss": 3.8075, "step": 347 }, { "epoch": 0.02709357208894944, "grad_norm": 8.431477540150862, "learning_rate": 3.999484154797675e-05, "loss": 3.798, "step": 348 }, { "epoch": 0.02717142718115907, "grad_norm": 7.178545858027762, "learning_rate": 3.999472755748747e-05, "loss": 3.6378, "step": 349 }, { "epoch": 0.027249282273368692, "grad_norm": 7.652721913026168, "learning_rate": 3.999461232142065e-05, "loss": 3.5776, "step": 350 }, { "epoch": 0.027249282273368692, "eval_loss": 0.45219314098358154, "eval_runtime": 161.8735, "eval_samples_per_second": 17.792, "eval_steps_per_second": 0.636, "step": 350 }, { "epoch": 0.02732713736557832, "grad_norm": 7.155337381891977, "learning_rate": 3.9994495839783475e-05, "loss": 3.6071, "step": 351 }, { "epoch": 0.027404992457787943, "grad_norm": 10.14684553217658, "learning_rate": 3.999437811258318e-05, "loss": 3.4118, "step": 352 }, { "epoch": 0.027482847549997566, "grad_norm": 8.358447776875252, "learning_rate": 3.9994259139827113e-05, "loss": 3.4967, "step": 353 }, { "epoch": 0.027560702642207193, "grad_norm": 6.122197605946078, "learning_rate": 3.9994138921522686e-05, "loss": 3.5667, "step": 354 }, { "epoch": 0.027638557734416817, "grad_norm": 8.343155306101417, "learning_rate": 3.9994017457677384e-05, "loss": 3.4835, "step": 355 }, { "epoch": 0.02771641282662644, "grad_norm": 7.688783727034534, "learning_rate": 3.999389474829877e-05, "loss": 3.6152, "step": 356 }, { "epoch": 0.027794267918836067, "grad_norm": 8.326929550787964, "learning_rate": 3.999377079339449e-05, "loss": 3.5829, "step": 357 }, { "epoch": 0.02787212301104569, "grad_norm": 7.290573829359664, "learning_rate": 3.9993645592972285e-05, "loss": 3.5379, "step": 358 }, { "epoch": 0.027949978103255314, "grad_norm": 6.343000780679945, "learning_rate": 3.999351914703992e-05, "loss": 3.4026, "step": 359 }, { "epoch": 0.02802783319546494, "grad_norm": 7.208175193493985, "learning_rate": 3.9993391455605305e-05, "loss": 3.6098, "step": 360 }, { "epoch": 0.028105688287674565, "grad_norm": 7.59439166329816, "learning_rate": 3.999326251867637e-05, "loss": 3.5525, "step": 361 }, { "epoch": 0.028183543379884192, "grad_norm": 7.9771040576479795, "learning_rate": 3.999313233626115e-05, "loss": 3.5183, "step": 362 }, { "epoch": 0.028261398472093816, "grad_norm": 8.218489118828819, "learning_rate": 3.999300090836778e-05, "loss": 3.5028, "step": 363 }, { "epoch": 0.02833925356430344, "grad_norm": 8.214095996870148, "learning_rate": 3.999286823500442e-05, "loss": 3.4997, "step": 364 }, { "epoch": 0.028417108656513066, "grad_norm": 7.162054301205459, "learning_rate": 3.999273431617934e-05, "loss": 3.4927, "step": 365 }, { "epoch": 0.02849496374872269, "grad_norm": 9.337008921108, "learning_rate": 3.999259915190089e-05, "loss": 3.7104, "step": 366 }, { "epoch": 0.028572818840932313, "grad_norm": 7.496971341264565, "learning_rate": 3.999246274217749e-05, "loss": 3.5089, "step": 367 }, { "epoch": 0.02865067393314194, "grad_norm": 8.734653150312996, "learning_rate": 3.999232508701762e-05, "loss": 3.4167, "step": 368 }, { "epoch": 0.028728529025351564, "grad_norm": 6.546671389914708, "learning_rate": 3.999218618642988e-05, "loss": 3.3634, "step": 369 }, { "epoch": 0.02880638411756119, "grad_norm": 7.525401953380788, "learning_rate": 3.999204604042291e-05, "loss": 3.397, "step": 370 }, { "epoch": 0.028884239209770814, "grad_norm": 6.960697047148772, "learning_rate": 3.999190464900544e-05, "loss": 3.3232, "step": 371 }, { "epoch": 0.028962094301980438, "grad_norm": 8.555995319456102, "learning_rate": 3.999176201218629e-05, "loss": 3.4877, "step": 372 }, { "epoch": 0.029039949394190065, "grad_norm": 8.795664763567812, "learning_rate": 3.9991618129974334e-05, "loss": 3.4629, "step": 373 }, { "epoch": 0.02911780448639969, "grad_norm": 6.7051262755584355, "learning_rate": 3.9991473002378534e-05, "loss": 3.3574, "step": 374 }, { "epoch": 0.029195659578609312, "grad_norm": 7.480777617073177, "learning_rate": 3.999132662940793e-05, "loss": 3.3803, "step": 375 }, { "epoch": 0.02927351467081894, "grad_norm": 8.452998105843863, "learning_rate": 3.999117901107165e-05, "loss": 3.5134, "step": 376 }, { "epoch": 0.029351369763028563, "grad_norm": 7.862007396421669, "learning_rate": 3.999103014737889e-05, "loss": 3.5739, "step": 377 }, { "epoch": 0.02942922485523819, "grad_norm": 6.94034982826261, "learning_rate": 3.99908800383389e-05, "loss": 3.4548, "step": 378 }, { "epoch": 0.029507079947447813, "grad_norm": 7.439142133329545, "learning_rate": 3.9990728683961066e-05, "loss": 3.2838, "step": 379 }, { "epoch": 0.029584935039657437, "grad_norm": 6.893930495212844, "learning_rate": 3.999057608425479e-05, "loss": 3.1917, "step": 380 }, { "epoch": 0.029662790131867064, "grad_norm": 7.959171464811116, "learning_rate": 3.999042223922959e-05, "loss": 3.3256, "step": 381 }, { "epoch": 0.029740645224076687, "grad_norm": 6.703343485214345, "learning_rate": 3.999026714889505e-05, "loss": 3.2469, "step": 382 }, { "epoch": 0.02981850031628631, "grad_norm": 7.9368842586242705, "learning_rate": 3.999011081326083e-05, "loss": 3.3493, "step": 383 }, { "epoch": 0.029896355408495938, "grad_norm": 6.567251283107349, "learning_rate": 3.9989953232336665e-05, "loss": 3.2416, "step": 384 }, { "epoch": 0.02997421050070556, "grad_norm": 6.7494647622162685, "learning_rate": 3.998979440613238e-05, "loss": 3.4287, "step": 385 }, { "epoch": 0.030052065592915185, "grad_norm": 7.728525129197748, "learning_rate": 3.998963433465786e-05, "loss": 3.2358, "step": 386 }, { "epoch": 0.030129920685124812, "grad_norm": 6.315861602089964, "learning_rate": 3.998947301792308e-05, "loss": 3.3477, "step": 387 }, { "epoch": 0.030207775777334436, "grad_norm": 6.360137270028921, "learning_rate": 3.998931045593809e-05, "loss": 3.2521, "step": 388 }, { "epoch": 0.030285630869544063, "grad_norm": 6.541076203916866, "learning_rate": 3.9989146648713016e-05, "loss": 3.4068, "step": 389 }, { "epoch": 0.030363485961753686, "grad_norm": 8.013535769578514, "learning_rate": 3.998898159625807e-05, "loss": 3.1359, "step": 390 }, { "epoch": 0.03044134105396331, "grad_norm": 8.331868345090317, "learning_rate": 3.998881529858352e-05, "loss": 3.4559, "step": 391 }, { "epoch": 0.030519196146172937, "grad_norm": 5.850646992034681, "learning_rate": 3.998864775569973e-05, "loss": 3.4272, "step": 392 }, { "epoch": 0.03059705123838256, "grad_norm": 7.340273863008032, "learning_rate": 3.9988478967617145e-05, "loss": 3.2977, "step": 393 }, { "epoch": 0.030674906330592184, "grad_norm": 6.744602025040305, "learning_rate": 3.9988308934346276e-05, "loss": 3.3348, "step": 394 }, { "epoch": 0.03075276142280181, "grad_norm": 6.482360149527259, "learning_rate": 3.9988137655897705e-05, "loss": 3.1963, "step": 395 }, { "epoch": 0.030830616515011434, "grad_norm": 8.740607948900097, "learning_rate": 3.998796513228212e-05, "loss": 3.2891, "step": 396 }, { "epoch": 0.03090847160722106, "grad_norm": 7.9915024736832985, "learning_rate": 3.998779136351026e-05, "loss": 3.168, "step": 397 }, { "epoch": 0.030986326699430685, "grad_norm": 10.099512872886823, "learning_rate": 3.998761634959294e-05, "loss": 3.22, "step": 398 }, { "epoch": 0.03106418179164031, "grad_norm": 6.669831138838547, "learning_rate": 3.998744009054108e-05, "loss": 3.3535, "step": 399 }, { "epoch": 0.031142036883849936, "grad_norm": 6.192028641315858, "learning_rate": 3.9987262586365644e-05, "loss": 3.2864, "step": 400 }, { "epoch": 0.031142036883849936, "eval_loss": 0.4091295599937439, "eval_runtime": 161.9868, "eval_samples_per_second": 17.779, "eval_steps_per_second": 0.636, "step": 400 }, { "epoch": 0.03121989197605956, "grad_norm": 8.063579640276592, "learning_rate": 3.9987083837077704e-05, "loss": 3.1922, "step": 401 }, { "epoch": 0.03129774706826918, "grad_norm": 5.903710538695727, "learning_rate": 3.998690384268838e-05, "loss": 3.2959, "step": 402 }, { "epoch": 0.031375602160478806, "grad_norm": 7.674187558826993, "learning_rate": 3.9986722603208904e-05, "loss": 3.4026, "step": 403 }, { "epoch": 0.03145345725268844, "grad_norm": 6.237624772512992, "learning_rate": 3.998654011865055e-05, "loss": 2.9821, "step": 404 }, { "epoch": 0.03153131234489806, "grad_norm": 8.312482262021568, "learning_rate": 3.998635638902469e-05, "loss": 3.0938, "step": 405 }, { "epoch": 0.031609167437107684, "grad_norm": 7.209155838463676, "learning_rate": 3.9986171414342776e-05, "loss": 3.103, "step": 406 }, { "epoch": 0.03168702252931731, "grad_norm": 7.551458772121457, "learning_rate": 3.998598519461633e-05, "loss": 3.1822, "step": 407 }, { "epoch": 0.03176487762152693, "grad_norm": 7.029892223773923, "learning_rate": 3.9985797729856935e-05, "loss": 3.351, "step": 408 }, { "epoch": 0.031842732713736555, "grad_norm": 7.223548372363803, "learning_rate": 3.998560902007629e-05, "loss": 3.2426, "step": 409 }, { "epoch": 0.031920587805946185, "grad_norm": 7.830200662813156, "learning_rate": 3.9985419065286146e-05, "loss": 3.1747, "step": 410 }, { "epoch": 0.03199844289815581, "grad_norm": 5.294760585305304, "learning_rate": 3.998522786549833e-05, "loss": 3.084, "step": 411 }, { "epoch": 0.03207629799036543, "grad_norm": 6.332490780437531, "learning_rate": 3.998503542072476e-05, "loss": 3.1328, "step": 412 }, { "epoch": 0.032154153082575056, "grad_norm": 7.976241200660227, "learning_rate": 3.998484173097742e-05, "loss": 3.2185, "step": 413 }, { "epoch": 0.03223200817478468, "grad_norm": 6.5657063366117265, "learning_rate": 3.998464679626838e-05, "loss": 3.113, "step": 414 }, { "epoch": 0.03230986326699431, "grad_norm": 5.940801209068665, "learning_rate": 3.998445061660978e-05, "loss": 3.0043, "step": 415 }, { "epoch": 0.03238771835920393, "grad_norm": 5.916043473150777, "learning_rate": 3.998425319201384e-05, "loss": 3.0989, "step": 416 }, { "epoch": 0.03246557345141356, "grad_norm": 7.040152153855754, "learning_rate": 3.998405452249285e-05, "loss": 3.2093, "step": 417 }, { "epoch": 0.03254342854362318, "grad_norm": 7.043188676630723, "learning_rate": 3.998385460805922e-05, "loss": 3.091, "step": 418 }, { "epoch": 0.032621283635832804, "grad_norm": 7.0263181959421015, "learning_rate": 3.998365344872536e-05, "loss": 3.2241, "step": 419 }, { "epoch": 0.032699138728042434, "grad_norm": 7.259260230484285, "learning_rate": 3.998345104450383e-05, "loss": 3.1095, "step": 420 }, { "epoch": 0.03277699382025206, "grad_norm": 6.391483081459429, "learning_rate": 3.998324739540723e-05, "loss": 3.0696, "step": 421 }, { "epoch": 0.03285484891246168, "grad_norm": 7.772318800157866, "learning_rate": 3.998304250144824e-05, "loss": 3.1756, "step": 422 }, { "epoch": 0.032932704004671305, "grad_norm": 5.630049278229408, "learning_rate": 3.9982836362639644e-05, "loss": 2.9336, "step": 423 }, { "epoch": 0.03301055909688093, "grad_norm": 7.777765950575289, "learning_rate": 3.998262897899426e-05, "loss": 3.1222, "step": 424 }, { "epoch": 0.03308841418909055, "grad_norm": 6.756199442396918, "learning_rate": 3.998242035052502e-05, "loss": 3.1256, "step": 425 }, { "epoch": 0.03316626928130018, "grad_norm": 7.277491578806928, "learning_rate": 3.998221047724492e-05, "loss": 3.1387, "step": 426 }, { "epoch": 0.033244124373509806, "grad_norm": 6.38376918999622, "learning_rate": 3.998199935916703e-05, "loss": 3.0151, "step": 427 }, { "epoch": 0.03332197946571943, "grad_norm": 5.417406591624217, "learning_rate": 3.99817869963045e-05, "loss": 3.1309, "step": 428 }, { "epoch": 0.03339983455792905, "grad_norm": 6.351818681719743, "learning_rate": 3.998157338867057e-05, "loss": 3.1145, "step": 429 }, { "epoch": 0.03347768965013868, "grad_norm": 6.483439487106309, "learning_rate": 3.9981358536278536e-05, "loss": 3.114, "step": 430 }, { "epoch": 0.03355554474234831, "grad_norm": 6.378126098867989, "learning_rate": 3.998114243914179e-05, "loss": 3.1016, "step": 431 }, { "epoch": 0.03363339983455793, "grad_norm": 7.3151249403780865, "learning_rate": 3.998092509727378e-05, "loss": 3.1491, "step": 432 }, { "epoch": 0.033711254926767555, "grad_norm": 6.4541007469303775, "learning_rate": 3.998070651068807e-05, "loss": 2.9388, "step": 433 }, { "epoch": 0.03378911001897718, "grad_norm": 6.010247108699217, "learning_rate": 3.9980486679398245e-05, "loss": 2.999, "step": 434 }, { "epoch": 0.0338669651111868, "grad_norm": 6.85878040428994, "learning_rate": 3.998026560341802e-05, "loss": 3.007, "step": 435 }, { "epoch": 0.033944820203396425, "grad_norm": 5.7670105046254285, "learning_rate": 3.998004328276117e-05, "loss": 2.9128, "step": 436 }, { "epoch": 0.034022675295606056, "grad_norm": 5.940835090378987, "learning_rate": 3.997981971744154e-05, "loss": 2.9443, "step": 437 }, { "epoch": 0.03410053038781568, "grad_norm": 6.139950176732448, "learning_rate": 3.9979594907473054e-05, "loss": 2.9743, "step": 438 }, { "epoch": 0.0341783854800253, "grad_norm": 6.9170586732881025, "learning_rate": 3.997936885286971e-05, "loss": 3.1609, "step": 439 }, { "epoch": 0.034256240572234926, "grad_norm": 7.595865221875089, "learning_rate": 3.997914155364561e-05, "loss": 2.9721, "step": 440 }, { "epoch": 0.03433409566444455, "grad_norm": 6.144056825037216, "learning_rate": 3.997891300981489e-05, "loss": 2.8428, "step": 441 }, { "epoch": 0.03441195075665418, "grad_norm": 6.422629768242784, "learning_rate": 3.9978683221391804e-05, "loss": 3.2052, "step": 442 }, { "epoch": 0.034489805848863804, "grad_norm": 6.050822577963971, "learning_rate": 3.997845218839065e-05, "loss": 3.0045, "step": 443 }, { "epoch": 0.03456766094107343, "grad_norm": 7.14190229079742, "learning_rate": 3.9978219910825846e-05, "loss": 3.1423, "step": 444 }, { "epoch": 0.03464551603328305, "grad_norm": 6.209459852067398, "learning_rate": 3.997798638871184e-05, "loss": 3.0929, "step": 445 }, { "epoch": 0.034723371125492675, "grad_norm": 6.636524042129381, "learning_rate": 3.997775162206319e-05, "loss": 3.0145, "step": 446 }, { "epoch": 0.0348012262177023, "grad_norm": 6.381061478224473, "learning_rate": 3.9977515610894516e-05, "loss": 3.0684, "step": 447 }, { "epoch": 0.03487908130991193, "grad_norm": 7.220674017652841, "learning_rate": 3.997727835522053e-05, "loss": 2.9125, "step": 448 }, { "epoch": 0.03495693640212155, "grad_norm": 6.595084666452877, "learning_rate": 3.9977039855055996e-05, "loss": 2.922, "step": 449 }, { "epoch": 0.035034791494331176, "grad_norm": 6.034486439289878, "learning_rate": 3.997680011041578e-05, "loss": 3.0378, "step": 450 }, { "epoch": 0.035034791494331176, "eval_loss": 0.3684728741645813, "eval_runtime": 162.6543, "eval_samples_per_second": 17.706, "eval_steps_per_second": 0.633, "step": 450 }, { "epoch": 0.0351126465865408, "grad_norm": 5.882558403792554, "learning_rate": 3.997655912131482e-05, "loss": 2.9976, "step": 451 }, { "epoch": 0.03519050167875042, "grad_norm": 6.638069523929824, "learning_rate": 3.997631688776812e-05, "loss": 2.981, "step": 452 }, { "epoch": 0.03526835677096005, "grad_norm": 6.150894513024609, "learning_rate": 3.997607340979078e-05, "loss": 2.8969, "step": 453 }, { "epoch": 0.03534621186316968, "grad_norm": 5.218911469672009, "learning_rate": 3.9975828687397954e-05, "loss": 2.9874, "step": 454 }, { "epoch": 0.0354240669553793, "grad_norm": 6.338180091397565, "learning_rate": 3.99755827206049e-05, "loss": 2.9784, "step": 455 }, { "epoch": 0.035501922047588924, "grad_norm": 6.66351553912053, "learning_rate": 3.9975335509426945e-05, "loss": 2.8694, "step": 456 }, { "epoch": 0.03557977713979855, "grad_norm": 5.6385742559106715, "learning_rate": 3.997508705387946e-05, "loss": 2.8306, "step": 457 }, { "epoch": 0.03565763223200818, "grad_norm": 6.261914257414027, "learning_rate": 3.997483735397796e-05, "loss": 2.8364, "step": 458 }, { "epoch": 0.0357354873242178, "grad_norm": 5.313221334721657, "learning_rate": 3.997458640973798e-05, "loss": 2.8425, "step": 459 }, { "epoch": 0.035813342416427425, "grad_norm": 6.266375078646286, "learning_rate": 3.997433422117516e-05, "loss": 2.8525, "step": 460 }, { "epoch": 0.03589119750863705, "grad_norm": 7.3616922861330805, "learning_rate": 3.997408078830519e-05, "loss": 2.8693, "step": 461 }, { "epoch": 0.03596905260084667, "grad_norm": 5.856018064348467, "learning_rate": 3.997382611114388e-05, "loss": 2.9266, "step": 462 }, { "epoch": 0.036046907693056296, "grad_norm": 6.337187116661087, "learning_rate": 3.9973570189707094e-05, "loss": 2.8608, "step": 463 }, { "epoch": 0.036124762785265926, "grad_norm": 5.932066531250987, "learning_rate": 3.997331302401077e-05, "loss": 2.826, "step": 464 }, { "epoch": 0.03620261787747555, "grad_norm": 5.841459644698147, "learning_rate": 3.997305461407092e-05, "loss": 2.9293, "step": 465 }, { "epoch": 0.036280472969685174, "grad_norm": 6.508233742990281, "learning_rate": 3.997279495990365e-05, "loss": 2.8125, "step": 466 }, { "epoch": 0.0363583280618948, "grad_norm": 5.555770137762838, "learning_rate": 3.9972534061525136e-05, "loss": 2.9761, "step": 467 }, { "epoch": 0.03643618315410442, "grad_norm": 6.123662060072845, "learning_rate": 3.997227191895163e-05, "loss": 2.912, "step": 468 }, { "epoch": 0.03651403824631405, "grad_norm": 4.854595805746336, "learning_rate": 3.9972008532199466e-05, "loss": 2.7803, "step": 469 }, { "epoch": 0.036591893338523675, "grad_norm": 5.539494611936039, "learning_rate": 3.997174390128504e-05, "loss": 2.8397, "step": 470 }, { "epoch": 0.0366697484307333, "grad_norm": 6.09727326682445, "learning_rate": 3.997147802622485e-05, "loss": 2.8729, "step": 471 }, { "epoch": 0.03674760352294292, "grad_norm": 7.180677775234213, "learning_rate": 3.997121090703545e-05, "loss": 2.8189, "step": 472 }, { "epoch": 0.036825458615152545, "grad_norm": 5.8238699649798615, "learning_rate": 3.9970942543733483e-05, "loss": 2.8997, "step": 473 }, { "epoch": 0.03690331370736217, "grad_norm": 5.595131093277113, "learning_rate": 3.997067293633567e-05, "loss": 2.7174, "step": 474 }, { "epoch": 0.0369811687995718, "grad_norm": 5.682826387154349, "learning_rate": 3.99704020848588e-05, "loss": 2.8283, "step": 475 }, { "epoch": 0.03705902389178142, "grad_norm": 5.812268205582246, "learning_rate": 3.9970129989319756e-05, "loss": 2.9348, "step": 476 }, { "epoch": 0.03713687898399105, "grad_norm": 5.52436222297427, "learning_rate": 3.996985664973547e-05, "loss": 2.7305, "step": 477 }, { "epoch": 0.03721473407620067, "grad_norm": 6.599785138282688, "learning_rate": 3.996958206612299e-05, "loss": 2.9051, "step": 478 }, { "epoch": 0.037292589168410294, "grad_norm": 7.158053810507964, "learning_rate": 3.996930623849941e-05, "loss": 2.9653, "step": 479 }, { "epoch": 0.037370444260619924, "grad_norm": 7.504551725701539, "learning_rate": 3.996902916688192e-05, "loss": 2.8433, "step": 480 }, { "epoch": 0.03744829935282955, "grad_norm": 5.57721976616752, "learning_rate": 3.996875085128777e-05, "loss": 2.922, "step": 481 }, { "epoch": 0.03752615444503917, "grad_norm": 6.410003564862024, "learning_rate": 3.9968471291734304e-05, "loss": 2.7094, "step": 482 }, { "epoch": 0.037604009537248795, "grad_norm": 5.722159162393937, "learning_rate": 3.996819048823894e-05, "loss": 2.7407, "step": 483 }, { "epoch": 0.03768186462945842, "grad_norm": 5.567755964015476, "learning_rate": 3.9967908440819164e-05, "loss": 2.7347, "step": 484 }, { "epoch": 0.03775971972166804, "grad_norm": 5.754548429427662, "learning_rate": 3.996762514949255e-05, "loss": 2.8055, "step": 485 }, { "epoch": 0.03783757481387767, "grad_norm": 4.8531529093414045, "learning_rate": 3.9967340614276745e-05, "loss": 2.7247, "step": 486 }, { "epoch": 0.037915429906087296, "grad_norm": 7.101175122017643, "learning_rate": 3.9967054835189475e-05, "loss": 2.846, "step": 487 }, { "epoch": 0.03799328499829692, "grad_norm": 7.6321238428962195, "learning_rate": 3.9966767812248545e-05, "loss": 2.9286, "step": 488 }, { "epoch": 0.03807114009050654, "grad_norm": 6.261242899687711, "learning_rate": 3.996647954547183e-05, "loss": 2.8205, "step": 489 }, { "epoch": 0.03814899518271617, "grad_norm": 5.277633768375664, "learning_rate": 3.996619003487728e-05, "loss": 2.703, "step": 490 }, { "epoch": 0.0382268502749258, "grad_norm": 5.878946599720253, "learning_rate": 3.996589928048295e-05, "loss": 2.6882, "step": 491 }, { "epoch": 0.03830470536713542, "grad_norm": 6.231859537024604, "learning_rate": 3.9965607282306955e-05, "loss": 2.7352, "step": 492 }, { "epoch": 0.038382560459345044, "grad_norm": 5.353057547669011, "learning_rate": 3.9965314040367456e-05, "loss": 2.7136, "step": 493 }, { "epoch": 0.03846041555155467, "grad_norm": 6.467444460252131, "learning_rate": 3.9965019554682744e-05, "loss": 2.726, "step": 494 }, { "epoch": 0.03853827064376429, "grad_norm": 6.350714685849469, "learning_rate": 3.9964723825271156e-05, "loss": 2.7455, "step": 495 }, { "epoch": 0.03861612573597392, "grad_norm": 4.8443075757018805, "learning_rate": 3.996442685215112e-05, "loss": 2.7175, "step": 496 }, { "epoch": 0.038693980828183545, "grad_norm": 4.787798826549846, "learning_rate": 3.996412863534113e-05, "loss": 2.7139, "step": 497 }, { "epoch": 0.03877183592039317, "grad_norm": 6.344883725853351, "learning_rate": 3.996382917485976e-05, "loss": 2.7528, "step": 498 }, { "epoch": 0.03884969101260279, "grad_norm": 6.255030122606726, "learning_rate": 3.996352847072567e-05, "loss": 2.696, "step": 499 }, { "epoch": 0.038927546104812416, "grad_norm": 5.380527814688454, "learning_rate": 3.99632265229576e-05, "loss": 2.695, "step": 500 }, { "epoch": 0.038927546104812416, "eval_loss": 0.3429034650325775, "eval_runtime": 162.5995, "eval_samples_per_second": 17.712, "eval_steps_per_second": 0.633, "step": 500 }, { "epoch": 0.03900540119702204, "grad_norm": 6.124689864968937, "learning_rate": 3.996292333157436e-05, "loss": 2.7469, "step": 501 }, { "epoch": 0.03908325628923167, "grad_norm": 5.24322630210827, "learning_rate": 3.9962618896594815e-05, "loss": 2.7839, "step": 502 }, { "epoch": 0.039161111381441294, "grad_norm": 6.016528066674608, "learning_rate": 3.9962313218037954e-05, "loss": 2.7017, "step": 503 }, { "epoch": 0.03923896647365092, "grad_norm": 6.055137900582529, "learning_rate": 3.996200629592281e-05, "loss": 2.6557, "step": 504 }, { "epoch": 0.03931682156586054, "grad_norm": 6.44462273669886, "learning_rate": 3.99616981302685e-05, "loss": 2.7204, "step": 505 }, { "epoch": 0.039394676658070164, "grad_norm": 5.757709625470139, "learning_rate": 3.9961388721094226e-05, "loss": 2.6625, "step": 506 }, { "epoch": 0.039472531750279795, "grad_norm": 6.534667718888337, "learning_rate": 3.996107806841926e-05, "loss": 2.7679, "step": 507 }, { "epoch": 0.03955038684248942, "grad_norm": 5.823351332135792, "learning_rate": 3.996076617226296e-05, "loss": 2.6825, "step": 508 }, { "epoch": 0.03962824193469904, "grad_norm": 6.070376761401637, "learning_rate": 3.996045303264475e-05, "loss": 2.7571, "step": 509 }, { "epoch": 0.039706097026908665, "grad_norm": 5.631361373962709, "learning_rate": 3.996013864958414e-05, "loss": 2.6514, "step": 510 }, { "epoch": 0.03978395211911829, "grad_norm": 4.900287073014881, "learning_rate": 3.9959823023100716e-05, "loss": 2.5001, "step": 511 }, { "epoch": 0.03986180721132791, "grad_norm": 5.322326604934521, "learning_rate": 3.9959506153214124e-05, "loss": 2.7088, "step": 512 }, { "epoch": 0.03993966230353754, "grad_norm": 5.6246830265482375, "learning_rate": 3.9959188039944125e-05, "loss": 2.7084, "step": 513 }, { "epoch": 0.04001751739574717, "grad_norm": 5.1787379929182515, "learning_rate": 3.995886868331053e-05, "loss": 2.6189, "step": 514 }, { "epoch": 0.04009537248795679, "grad_norm": 6.461023087434521, "learning_rate": 3.995854808333323e-05, "loss": 2.7097, "step": 515 }, { "epoch": 0.040173227580166414, "grad_norm": 5.928836221514381, "learning_rate": 3.9958226240032194e-05, "loss": 2.8352, "step": 516 }, { "epoch": 0.04025108267237604, "grad_norm": 5.384374608337842, "learning_rate": 3.9957903153427476e-05, "loss": 2.6666, "step": 517 }, { "epoch": 0.04032893776458567, "grad_norm": 4.738124831348933, "learning_rate": 3.9957578823539206e-05, "loss": 2.546, "step": 518 }, { "epoch": 0.04040679285679529, "grad_norm": 5.464573392436495, "learning_rate": 3.9957253250387584e-05, "loss": 2.6611, "step": 519 }, { "epoch": 0.040484647949004915, "grad_norm": 5.864163657211923, "learning_rate": 3.995692643399289e-05, "loss": 2.6049, "step": 520 }, { "epoch": 0.04056250304121454, "grad_norm": 4.76492308048879, "learning_rate": 3.995659837437548e-05, "loss": 2.6063, "step": 521 }, { "epoch": 0.04064035813342416, "grad_norm": 5.735611241089373, "learning_rate": 3.99562690715558e-05, "loss": 2.6943, "step": 522 }, { "epoch": 0.04071821322563379, "grad_norm": 4.9210407817873225, "learning_rate": 3.995593852555436e-05, "loss": 2.5693, "step": 523 }, { "epoch": 0.040796068317843416, "grad_norm": 5.467681927532633, "learning_rate": 3.995560673639175e-05, "loss": 2.6779, "step": 524 }, { "epoch": 0.04087392341005304, "grad_norm": 5.973400072127545, "learning_rate": 3.995527370408864e-05, "loss": 2.7323, "step": 525 }, { "epoch": 0.04095177850226266, "grad_norm": 5.661594520109599, "learning_rate": 3.9954939428665776e-05, "loss": 2.6415, "step": 526 }, { "epoch": 0.04102963359447229, "grad_norm": 6.974510857694689, "learning_rate": 3.995460391014398e-05, "loss": 2.6279, "step": 527 }, { "epoch": 0.04110748868668191, "grad_norm": 6.072746828065688, "learning_rate": 3.995426714854416e-05, "loss": 2.6843, "step": 528 }, { "epoch": 0.04118534377889154, "grad_norm": 5.003118117974469, "learning_rate": 3.995392914388729e-05, "loss": 2.614, "step": 529 }, { "epoch": 0.041263198871101164, "grad_norm": 4.848210520962919, "learning_rate": 3.995358989619443e-05, "loss": 2.5495, "step": 530 }, { "epoch": 0.04134105396331079, "grad_norm": 5.6252985459024325, "learning_rate": 3.99532494054867e-05, "loss": 2.6537, "step": 531 }, { "epoch": 0.04141890905552041, "grad_norm": 5.492551123481622, "learning_rate": 3.995290767178532e-05, "loss": 2.483, "step": 532 }, { "epoch": 0.041496764147730035, "grad_norm": 6.01008195240802, "learning_rate": 3.995256469511159e-05, "loss": 2.7852, "step": 533 }, { "epoch": 0.041574619239939666, "grad_norm": 4.98593182898864, "learning_rate": 3.995222047548686e-05, "loss": 2.6503, "step": 534 }, { "epoch": 0.04165247433214929, "grad_norm": 4.923222361761973, "learning_rate": 3.995187501293258e-05, "loss": 2.6374, "step": 535 }, { "epoch": 0.04173032942435891, "grad_norm": 5.255135486111619, "learning_rate": 3.9951528307470265e-05, "loss": 2.5519, "step": 536 }, { "epoch": 0.041808184516568536, "grad_norm": 5.864787452004835, "learning_rate": 3.9951180359121515e-05, "loss": 2.677, "step": 537 }, { "epoch": 0.04188603960877816, "grad_norm": 6.1011815200957376, "learning_rate": 3.995083116790801e-05, "loss": 2.6751, "step": 538 }, { "epoch": 0.04196389470098778, "grad_norm": 5.329300214759463, "learning_rate": 3.99504807338515e-05, "loss": 2.4496, "step": 539 }, { "epoch": 0.042041749793197414, "grad_norm": 4.7549870113763015, "learning_rate": 3.9950129056973816e-05, "loss": 2.5328, "step": 540 }, { "epoch": 0.04211960488540704, "grad_norm": 5.481509568659044, "learning_rate": 3.994977613729687e-05, "loss": 2.6309, "step": 541 }, { "epoch": 0.04219745997761666, "grad_norm": 5.638345998801199, "learning_rate": 3.994942197484263e-05, "loss": 2.4816, "step": 542 }, { "epoch": 0.042275315069826284, "grad_norm": 4.685975062789357, "learning_rate": 3.994906656963318e-05, "loss": 2.5145, "step": 543 }, { "epoch": 0.04235317016203591, "grad_norm": 5.311126150192787, "learning_rate": 3.994870992169065e-05, "loss": 2.5618, "step": 544 }, { "epoch": 0.04243102525424554, "grad_norm": 5.259103945695794, "learning_rate": 3.994835203103726e-05, "loss": 2.6263, "step": 545 }, { "epoch": 0.04250888034645516, "grad_norm": 4.2032747587650165, "learning_rate": 3.994799289769531e-05, "loss": 2.4436, "step": 546 }, { "epoch": 0.042586735438664786, "grad_norm": 5.045306748413166, "learning_rate": 3.9947632521687156e-05, "loss": 2.4171, "step": 547 }, { "epoch": 0.04266459053087441, "grad_norm": 4.901665921867398, "learning_rate": 3.994727090303527e-05, "loss": 2.5349, "step": 548 }, { "epoch": 0.04274244562308403, "grad_norm": 4.4808211701753615, "learning_rate": 3.994690804176216e-05, "loss": 2.5484, "step": 549 }, { "epoch": 0.042820300715293656, "grad_norm": 5.33723971188634, "learning_rate": 3.994654393789043e-05, "loss": 2.5222, "step": 550 }, { "epoch": 0.042820300715293656, "eval_loss": 0.3230932056903839, "eval_runtime": 162.1847, "eval_samples_per_second": 17.758, "eval_steps_per_second": 0.635, "step": 550 }, { "epoch": 0.04289815580750329, "grad_norm": 7.6078793261573985, "learning_rate": 3.994617859144279e-05, "loss": 2.5682, "step": 551 }, { "epoch": 0.04297601089971291, "grad_norm": 5.295209645285475, "learning_rate": 3.994581200244196e-05, "loss": 2.5873, "step": 552 }, { "epoch": 0.043053865991922534, "grad_norm": 4.413393358394375, "learning_rate": 3.9945444170910804e-05, "loss": 2.4996, "step": 553 }, { "epoch": 0.04313172108413216, "grad_norm": 5.145978466502656, "learning_rate": 3.994507509687223e-05, "loss": 2.549, "step": 554 }, { "epoch": 0.04320957617634178, "grad_norm": 5.204102624762245, "learning_rate": 3.9944704780349216e-05, "loss": 2.4023, "step": 555 }, { "epoch": 0.04328743126855141, "grad_norm": 5.076053437786019, "learning_rate": 3.994433322136486e-05, "loss": 2.5724, "step": 556 }, { "epoch": 0.043365286360761035, "grad_norm": 4.818687205156679, "learning_rate": 3.994396041994228e-05, "loss": 2.4491, "step": 557 }, { "epoch": 0.04344314145297066, "grad_norm": 4.876391452492393, "learning_rate": 3.994358637610471e-05, "loss": 2.4251, "step": 558 }, { "epoch": 0.04352099654518028, "grad_norm": 6.460056186607403, "learning_rate": 3.994321108987545e-05, "loss": 2.5665, "step": 559 }, { "epoch": 0.043598851637389906, "grad_norm": 6.08209108903879, "learning_rate": 3.9942834561277885e-05, "loss": 2.633, "step": 560 }, { "epoch": 0.043676706729599536, "grad_norm": 4.293357125479193, "learning_rate": 3.994245679033547e-05, "loss": 2.5766, "step": 561 }, { "epoch": 0.04375456182180916, "grad_norm": 5.748184474314906, "learning_rate": 3.994207777707173e-05, "loss": 2.4746, "step": 562 }, { "epoch": 0.04383241691401878, "grad_norm": 5.109379875460896, "learning_rate": 3.994169752151028e-05, "loss": 2.4752, "step": 563 }, { "epoch": 0.04391027200622841, "grad_norm": 4.814979517056638, "learning_rate": 3.994131602367481e-05, "loss": 2.4736, "step": 564 }, { "epoch": 0.04398812709843803, "grad_norm": 4.670728585459004, "learning_rate": 3.994093328358909e-05, "loss": 2.5476, "step": 565 }, { "epoch": 0.044065982190647654, "grad_norm": 5.32697923483008, "learning_rate": 3.994054930127695e-05, "loss": 2.5834, "step": 566 }, { "epoch": 0.044143837282857284, "grad_norm": 5.947231432133837, "learning_rate": 3.994016407676232e-05, "loss": 2.5013, "step": 567 }, { "epoch": 0.04422169237506691, "grad_norm": 5.175324825359515, "learning_rate": 3.9939777610069196e-05, "loss": 2.4424, "step": 568 }, { "epoch": 0.04429954746727653, "grad_norm": 5.195632111906956, "learning_rate": 3.9939389901221645e-05, "loss": 2.3506, "step": 569 }, { "epoch": 0.044377402559486155, "grad_norm": 5.157682309899753, "learning_rate": 3.9939000950243836e-05, "loss": 2.54, "step": 570 }, { "epoch": 0.04445525765169578, "grad_norm": 4.8480776009136894, "learning_rate": 3.993861075715998e-05, "loss": 2.4973, "step": 571 }, { "epoch": 0.04453311274390541, "grad_norm": 5.220151200899133, "learning_rate": 3.99382193219944e-05, "loss": 2.4071, "step": 572 }, { "epoch": 0.04461096783611503, "grad_norm": 4.395529892531457, "learning_rate": 3.9937826644771475e-05, "loss": 2.3948, "step": 573 }, { "epoch": 0.044688822928324656, "grad_norm": 5.15109498988179, "learning_rate": 3.9937432725515665e-05, "loss": 2.3856, "step": 574 }, { "epoch": 0.04476667802053428, "grad_norm": 4.611521597325651, "learning_rate": 3.99370375642515e-05, "loss": 2.4538, "step": 575 }, { "epoch": 0.0448445331127439, "grad_norm": 5.110397649122663, "learning_rate": 3.993664116100362e-05, "loss": 2.5223, "step": 576 }, { "epoch": 0.04492238820495353, "grad_norm": 5.937001574468982, "learning_rate": 3.9936243515796705e-05, "loss": 2.5253, "step": 577 }, { "epoch": 0.04500024329716316, "grad_norm": 5.109663490954265, "learning_rate": 3.9935844628655526e-05, "loss": 2.4724, "step": 578 }, { "epoch": 0.04507809838937278, "grad_norm": 4.670229004039174, "learning_rate": 3.993544449960493e-05, "loss": 2.4706, "step": 579 }, { "epoch": 0.045155953481582405, "grad_norm": 4.828032147780479, "learning_rate": 3.9935043128669855e-05, "loss": 2.3516, "step": 580 }, { "epoch": 0.04523380857379203, "grad_norm": 4.760014282714932, "learning_rate": 3.9934640515875284e-05, "loss": 2.4397, "step": 581 }, { "epoch": 0.04531166366600165, "grad_norm": 4.471706288566635, "learning_rate": 3.993423666124632e-05, "loss": 2.3573, "step": 582 }, { "epoch": 0.04538951875821128, "grad_norm": 5.39504740192963, "learning_rate": 3.993383156480811e-05, "loss": 2.4109, "step": 583 }, { "epoch": 0.045467373850420906, "grad_norm": 4.762463443270962, "learning_rate": 3.993342522658588e-05, "loss": 2.3657, "step": 584 }, { "epoch": 0.04554522894263053, "grad_norm": 3.987296971021684, "learning_rate": 3.993301764660497e-05, "loss": 2.4947, "step": 585 }, { "epoch": 0.04562308403484015, "grad_norm": 5.710882611179033, "learning_rate": 3.993260882489075e-05, "loss": 2.4757, "step": 586 }, { "epoch": 0.045700939127049776, "grad_norm": 5.9393317024057115, "learning_rate": 3.993219876146868e-05, "loss": 2.4941, "step": 587 }, { "epoch": 0.04577879421925941, "grad_norm": 4.395500985513923, "learning_rate": 3.9931787456364324e-05, "loss": 2.5071, "step": 588 }, { "epoch": 0.04585664931146903, "grad_norm": 3.707319708392615, "learning_rate": 3.99313749096033e-05, "loss": 2.2821, "step": 589 }, { "epoch": 0.045934504403678654, "grad_norm": 4.810446554739099, "learning_rate": 3.9930961121211303e-05, "loss": 2.3002, "step": 590 }, { "epoch": 0.04601235949588828, "grad_norm": 4.844905397420168, "learning_rate": 3.993054609121411e-05, "loss": 2.4113, "step": 591 }, { "epoch": 0.0460902145880979, "grad_norm": 4.624368187184992, "learning_rate": 3.993012981963758e-05, "loss": 2.3152, "step": 592 }, { "epoch": 0.046168069680307525, "grad_norm": 4.359760525466341, "learning_rate": 3.992971230650764e-05, "loss": 2.3248, "step": 593 }, { "epoch": 0.046245924772517155, "grad_norm": 5.163692357389221, "learning_rate": 3.9929293551850305e-05, "loss": 2.5199, "step": 594 }, { "epoch": 0.04632377986472678, "grad_norm": 5.299869314259809, "learning_rate": 3.992887355569165e-05, "loss": 2.4336, "step": 595 }, { "epoch": 0.0464016349569364, "grad_norm": 4.650001152888138, "learning_rate": 3.9928452318057854e-05, "loss": 2.4093, "step": 596 }, { "epoch": 0.046479490049146026, "grad_norm": 4.471982318314604, "learning_rate": 3.9928029838975144e-05, "loss": 2.3559, "step": 597 }, { "epoch": 0.04655734514135565, "grad_norm": 4.8657886036787605, "learning_rate": 3.992760611846985e-05, "loss": 2.4418, "step": 598 }, { "epoch": 0.04663520023356528, "grad_norm": 5.208282266441516, "learning_rate": 3.992718115656836e-05, "loss": 2.475, "step": 599 }, { "epoch": 0.0467130553257749, "grad_norm": 5.209940808475318, "learning_rate": 3.9926754953297153e-05, "loss": 2.3883, "step": 600 }, { "epoch": 0.0467130553257749, "eval_loss": 0.30325043201446533, "eval_runtime": 162.355, "eval_samples_per_second": 17.739, "eval_steps_per_second": 0.634, "step": 600 }, { "epoch": 0.04679091041798453, "grad_norm": 4.583643951688881, "learning_rate": 3.9926327508682775e-05, "loss": 2.3746, "step": 601 }, { "epoch": 0.04686876551019415, "grad_norm": 4.402357346657991, "learning_rate": 3.992589882275186e-05, "loss": 2.3548, "step": 602 }, { "epoch": 0.046946620602403774, "grad_norm": 4.382633943053607, "learning_rate": 3.992546889553111e-05, "loss": 2.4554, "step": 603 }, { "epoch": 0.0470244756946134, "grad_norm": 4.727706391685363, "learning_rate": 3.992503772704729e-05, "loss": 2.3746, "step": 604 }, { "epoch": 0.04710233078682303, "grad_norm": 4.770700663240834, "learning_rate": 3.992460531732729e-05, "loss": 2.3209, "step": 605 }, { "epoch": 0.04718018587903265, "grad_norm": 4.522978643540047, "learning_rate": 3.992417166639802e-05, "loss": 2.3415, "step": 606 }, { "epoch": 0.047258040971242275, "grad_norm": 4.776302269085311, "learning_rate": 3.9923736774286516e-05, "loss": 2.2623, "step": 607 }, { "epoch": 0.0473358960634519, "grad_norm": 5.273245381968566, "learning_rate": 3.992330064101986e-05, "loss": 2.4983, "step": 608 }, { "epoch": 0.04741375115566152, "grad_norm": 6.028933959562421, "learning_rate": 3.9922863266625225e-05, "loss": 2.4004, "step": 609 }, { "epoch": 0.04749160624787115, "grad_norm": 5.371256490695498, "learning_rate": 3.992242465112985e-05, "loss": 2.4182, "step": 610 }, { "epoch": 0.047569461340080776, "grad_norm": 5.132204059270486, "learning_rate": 3.992198479456106e-05, "loss": 2.4556, "step": 611 }, { "epoch": 0.0476473164322904, "grad_norm": 5.540238860263224, "learning_rate": 3.992154369694626e-05, "loss": 2.3811, "step": 612 }, { "epoch": 0.047725171524500024, "grad_norm": 5.0225189711747, "learning_rate": 3.992110135831294e-05, "loss": 2.34, "step": 613 }, { "epoch": 0.04780302661670965, "grad_norm": 4.932856405359925, "learning_rate": 3.9920657778688625e-05, "loss": 2.4572, "step": 614 }, { "epoch": 0.04788088170891927, "grad_norm": 5.138228594383906, "learning_rate": 3.992021295810098e-05, "loss": 2.3991, "step": 615 }, { "epoch": 0.0479587368011289, "grad_norm": 5.461011510686317, "learning_rate": 3.991976689657769e-05, "loss": 2.3645, "step": 616 }, { "epoch": 0.048036591893338525, "grad_norm": 4.826001613720935, "learning_rate": 3.991931959414656e-05, "loss": 2.3622, "step": 617 }, { "epoch": 0.04811444698554815, "grad_norm": 4.171596431095917, "learning_rate": 3.9918871050835444e-05, "loss": 2.3768, "step": 618 }, { "epoch": 0.04819230207775777, "grad_norm": 5.268602558922981, "learning_rate": 3.991842126667229e-05, "loss": 2.2549, "step": 619 }, { "epoch": 0.048270157169967395, "grad_norm": 5.572952730909846, "learning_rate": 3.991797024168512e-05, "loss": 2.4452, "step": 620 }, { "epoch": 0.048348012262177026, "grad_norm": 4.564657454656384, "learning_rate": 3.991751797590202e-05, "loss": 2.3146, "step": 621 }, { "epoch": 0.04842586735438665, "grad_norm": 5.211395347863516, "learning_rate": 3.9917064469351174e-05, "loss": 2.403, "step": 622 }, { "epoch": 0.04850372244659627, "grad_norm": 5.010702646450692, "learning_rate": 3.991660972206083e-05, "loss": 2.252, "step": 623 }, { "epoch": 0.048581577538805897, "grad_norm": 4.618214499026103, "learning_rate": 3.991615373405932e-05, "loss": 2.2295, "step": 624 }, { "epoch": 0.04865943263101552, "grad_norm": 5.116452538579135, "learning_rate": 3.991569650537504e-05, "loss": 2.2722, "step": 625 }, { "epoch": 0.04873728772322515, "grad_norm": 4.708441700926099, "learning_rate": 3.9915238036036486e-05, "loss": 2.2895, "step": 626 }, { "epoch": 0.048815142815434774, "grad_norm": 4.438249063255146, "learning_rate": 3.9914778326072203e-05, "loss": 2.3203, "step": 627 }, { "epoch": 0.0488929979076444, "grad_norm": 4.005906800723373, "learning_rate": 3.991431737551085e-05, "loss": 2.3301, "step": 628 }, { "epoch": 0.04897085299985402, "grad_norm": 4.941878772961862, "learning_rate": 3.991385518438112e-05, "loss": 2.2985, "step": 629 }, { "epoch": 0.049048708092063645, "grad_norm": 4.552074996954469, "learning_rate": 3.9913391752711815e-05, "loss": 2.5121, "step": 630 }, { "epoch": 0.04912656318427327, "grad_norm": 4.7495144102629965, "learning_rate": 3.9912927080531816e-05, "loss": 2.1914, "step": 631 }, { "epoch": 0.0492044182764829, "grad_norm": 4.035744575630287, "learning_rate": 3.991246116787005e-05, "loss": 2.1893, "step": 632 }, { "epoch": 0.04928227336869252, "grad_norm": 4.256800983221637, "learning_rate": 3.9911994014755555e-05, "loss": 2.2677, "step": 633 }, { "epoch": 0.049360128460902146, "grad_norm": 5.099125744317862, "learning_rate": 3.991152562121743e-05, "loss": 2.3391, "step": 634 }, { "epoch": 0.04943798355311177, "grad_norm": 4.2999987629107475, "learning_rate": 3.991105598728484e-05, "loss": 2.3327, "step": 635 }, { "epoch": 0.04951583864532139, "grad_norm": 4.595278138402136, "learning_rate": 3.9910585112987056e-05, "loss": 2.3267, "step": 636 }, { "epoch": 0.049593693737531024, "grad_norm": 5.419520556605844, "learning_rate": 3.9910112998353414e-05, "loss": 2.3073, "step": 637 }, { "epoch": 0.04967154882974065, "grad_norm": 5.223210098379064, "learning_rate": 3.9909639643413316e-05, "loss": 2.179, "step": 638 }, { "epoch": 0.04974940392195027, "grad_norm": 4.711052978819045, "learning_rate": 3.990916504819625e-05, "loss": 2.3192, "step": 639 }, { "epoch": 0.049827259014159894, "grad_norm": 4.222523775777126, "learning_rate": 3.990868921273178e-05, "loss": 2.3523, "step": 640 }, { "epoch": 0.04990511410636952, "grad_norm": 4.555625392146938, "learning_rate": 3.9908212137049556e-05, "loss": 2.305, "step": 641 }, { "epoch": 0.04998296919857914, "grad_norm": 5.37436991249181, "learning_rate": 3.990773382117929e-05, "loss": 2.2747, "step": 642 }, { "epoch": 0.05006082429078877, "grad_norm": 4.175487377030438, "learning_rate": 3.990725426515078e-05, "loss": 2.2663, "step": 643 }, { "epoch": 0.050138679382998395, "grad_norm": 4.0918755780204, "learning_rate": 3.9906773468993905e-05, "loss": 2.2388, "step": 644 }, { "epoch": 0.05021653447520802, "grad_norm": 4.753356982726, "learning_rate": 3.9906291432738606e-05, "loss": 2.2842, "step": 645 }, { "epoch": 0.05029438956741764, "grad_norm": 4.991938304601226, "learning_rate": 3.990580815641493e-05, "loss": 2.2554, "step": 646 }, { "epoch": 0.050372244659627266, "grad_norm": 3.8413920796003818, "learning_rate": 3.990532364005297e-05, "loss": 2.1851, "step": 647 }, { "epoch": 0.0504500997518369, "grad_norm": 5.042487869594479, "learning_rate": 3.99048378836829e-05, "loss": 2.3227, "step": 648 }, { "epoch": 0.05052795484404652, "grad_norm": 4.501901450439396, "learning_rate": 3.9904350887335e-05, "loss": 2.3004, "step": 649 }, { "epoch": 0.050605809936256144, "grad_norm": 5.513054562400171, "learning_rate": 3.99038626510396e-05, "loss": 2.3562, "step": 650 }, { "epoch": 0.050605809936256144, "eval_loss": 0.28696444630622864, "eval_runtime": 162.2541, "eval_samples_per_second": 17.75, "eval_steps_per_second": 0.635, "step": 650 }, { "epoch": 0.05068366502846577, "grad_norm": 4.001009511253755, "learning_rate": 3.99033731748271e-05, "loss": 2.168, "step": 651 }, { "epoch": 0.05076152012067539, "grad_norm": 5.197275634069619, "learning_rate": 3.990288245872802e-05, "loss": 2.2436, "step": 652 }, { "epoch": 0.050839375212885014, "grad_norm": 4.947464383354322, "learning_rate": 3.9902390502772905e-05, "loss": 2.3337, "step": 653 }, { "epoch": 0.050917230305094645, "grad_norm": 4.6684245317284425, "learning_rate": 3.990189730699242e-05, "loss": 2.2343, "step": 654 }, { "epoch": 0.05099508539730427, "grad_norm": 3.921733747908643, "learning_rate": 3.990140287141727e-05, "loss": 2.1748, "step": 655 }, { "epoch": 0.05107294048951389, "grad_norm": 5.033652294205518, "learning_rate": 3.9900907196078277e-05, "loss": 2.242, "step": 656 }, { "epoch": 0.051150795581723515, "grad_norm": 4.811324003139378, "learning_rate": 3.990041028100629e-05, "loss": 2.2612, "step": 657 }, { "epoch": 0.05122865067393314, "grad_norm": 4.610959511133895, "learning_rate": 3.9899912126232304e-05, "loss": 2.1935, "step": 658 }, { "epoch": 0.05130650576614277, "grad_norm": 5.069650836942429, "learning_rate": 3.989941273178732e-05, "loss": 2.2332, "step": 659 }, { "epoch": 0.05138436085835239, "grad_norm": 4.462045926938421, "learning_rate": 3.989891209770246e-05, "loss": 2.2845, "step": 660 }, { "epoch": 0.05146221595056202, "grad_norm": 4.3491225604035915, "learning_rate": 3.9898410224008906e-05, "loss": 2.3386, "step": 661 }, { "epoch": 0.05154007104277164, "grad_norm": 4.880283490078998, "learning_rate": 3.989790711073793e-05, "loss": 2.2126, "step": 662 }, { "epoch": 0.051617926134981264, "grad_norm": 4.688702608244195, "learning_rate": 3.989740275792087e-05, "loss": 2.3065, "step": 663 }, { "epoch": 0.051695781227190894, "grad_norm": 4.240549723032037, "learning_rate": 3.989689716558914e-05, "loss": 2.1985, "step": 664 }, { "epoch": 0.05177363631940052, "grad_norm": 4.6513820237341985, "learning_rate": 3.989639033377424e-05, "loss": 2.228, "step": 665 }, { "epoch": 0.05185149141161014, "grad_norm": 3.7843082543804574, "learning_rate": 3.989588226250775e-05, "loss": 2.0063, "step": 666 }, { "epoch": 0.051929346503819765, "grad_norm": 4.404485298255786, "learning_rate": 3.989537295182131e-05, "loss": 2.2066, "step": 667 }, { "epoch": 0.05200720159602939, "grad_norm": 4.117886202125009, "learning_rate": 3.989486240174665e-05, "loss": 2.2111, "step": 668 }, { "epoch": 0.05208505668823901, "grad_norm": 4.308050556154436, "learning_rate": 3.9894350612315576e-05, "loss": 2.1309, "step": 669 }, { "epoch": 0.05216291178044864, "grad_norm": 4.339612928167423, "learning_rate": 3.9893837583559976e-05, "loss": 2.1964, "step": 670 }, { "epoch": 0.052240766872658266, "grad_norm": 5.3098241581517485, "learning_rate": 3.98933233155118e-05, "loss": 2.3058, "step": 671 }, { "epoch": 0.05231862196486789, "grad_norm": 4.304785409111912, "learning_rate": 3.989280780820309e-05, "loss": 2.216, "step": 672 }, { "epoch": 0.05239647705707751, "grad_norm": 3.9658508793572644, "learning_rate": 3.9892291061665956e-05, "loss": 2.3022, "step": 673 }, { "epoch": 0.05247433214928714, "grad_norm": 4.178020414608181, "learning_rate": 3.9891773075932596e-05, "loss": 2.1235, "step": 674 }, { "epoch": 0.05255218724149677, "grad_norm": 5.506132090787341, "learning_rate": 3.9891253851035266e-05, "loss": 2.1187, "step": 675 }, { "epoch": 0.05263004233370639, "grad_norm": 4.2714101520321845, "learning_rate": 3.9890733387006315e-05, "loss": 2.2645, "step": 676 }, { "epoch": 0.052707897425916014, "grad_norm": 3.9749691023107108, "learning_rate": 3.989021168387818e-05, "loss": 2.1584, "step": 677 }, { "epoch": 0.05278575251812564, "grad_norm": 5.520881274204705, "learning_rate": 3.9889688741683346e-05, "loss": 2.2192, "step": 678 }, { "epoch": 0.05286360761033526, "grad_norm": 5.183909897826718, "learning_rate": 3.9889164560454385e-05, "loss": 2.166, "step": 679 }, { "epoch": 0.052941462702544885, "grad_norm": 4.122351331986685, "learning_rate": 3.988863914022397e-05, "loss": 2.2073, "step": 680 }, { "epoch": 0.053019317794754516, "grad_norm": 4.769690108728985, "learning_rate": 3.9888112481024815e-05, "loss": 2.2761, "step": 681 }, { "epoch": 0.05309717288696414, "grad_norm": 4.788585608855957, "learning_rate": 3.988758458288974e-05, "loss": 2.1922, "step": 682 }, { "epoch": 0.05317502797917376, "grad_norm": 3.8854463566467072, "learning_rate": 3.988705544585162e-05, "loss": 2.2503, "step": 683 }, { "epoch": 0.053252883071383386, "grad_norm": 5.9054019730586305, "learning_rate": 3.988652506994343e-05, "loss": 2.1833, "step": 684 }, { "epoch": 0.05333073816359301, "grad_norm": 4.490550822894565, "learning_rate": 3.9885993455198194e-05, "loss": 2.2224, "step": 685 }, { "epoch": 0.05340859325580264, "grad_norm": 4.642565036910719, "learning_rate": 3.988546060164904e-05, "loss": 2.1897, "step": 686 }, { "epoch": 0.053486448348012264, "grad_norm": 4.5458002035822265, "learning_rate": 3.9884926509329164e-05, "loss": 2.1787, "step": 687 }, { "epoch": 0.05356430344022189, "grad_norm": 4.5211069230490395, "learning_rate": 3.9884391178271835e-05, "loss": 2.1762, "step": 688 }, { "epoch": 0.05364215853243151, "grad_norm": 4.004286874430579, "learning_rate": 3.9883854608510394e-05, "loss": 2.1868, "step": 689 }, { "epoch": 0.053720013624641134, "grad_norm": 4.6527807132184416, "learning_rate": 3.988331680007828e-05, "loss": 2.3107, "step": 690 }, { "epoch": 0.053797868716850765, "grad_norm": 4.204410993205545, "learning_rate": 3.9882777753008984e-05, "loss": 2.1565, "step": 691 }, { "epoch": 0.05387572380906039, "grad_norm": 4.1816504750094685, "learning_rate": 3.9882237467336094e-05, "loss": 2.0883, "step": 692 }, { "epoch": 0.05395357890127001, "grad_norm": 4.6308972842065685, "learning_rate": 3.988169594309327e-05, "loss": 2.1818, "step": 693 }, { "epoch": 0.054031433993479636, "grad_norm": 4.710302182273478, "learning_rate": 3.988115318031423e-05, "loss": 2.1028, "step": 694 }, { "epoch": 0.05410928908568926, "grad_norm": 4.120008799191054, "learning_rate": 3.9880609179032804e-05, "loss": 2.0431, "step": 695 }, { "epoch": 0.05418714417789888, "grad_norm": 3.8800977575009052, "learning_rate": 3.988006393928287e-05, "loss": 2.161, "step": 696 }, { "epoch": 0.05426499927010851, "grad_norm": 3.997552923285165, "learning_rate": 3.98795174610984e-05, "loss": 2.1299, "step": 697 }, { "epoch": 0.05434285436231814, "grad_norm": 3.888523835947091, "learning_rate": 3.987896974451343e-05, "loss": 2.1267, "step": 698 }, { "epoch": 0.05442070945452776, "grad_norm": 4.455579617654663, "learning_rate": 3.987842078956209e-05, "loss": 2.0452, "step": 699 }, { "epoch": 0.054498564546737384, "grad_norm": 4.522000559038536, "learning_rate": 3.9877870596278565e-05, "loss": 2.1742, "step": 700 }, { "epoch": 0.054498564546737384, "eval_loss": 0.27294379472732544, "eval_runtime": 162.2779, "eval_samples_per_second": 17.747, "eval_steps_per_second": 0.635, "step": 700 }, { "epoch": 0.05457641963894701, "grad_norm": 4.784102969306616, "learning_rate": 3.987731916469714e-05, "loss": 2.0912, "step": 701 }, { "epoch": 0.05465427473115664, "grad_norm": 4.752890509583225, "learning_rate": 3.987676649485216e-05, "loss": 2.1232, "step": 702 }, { "epoch": 0.05473212982336626, "grad_norm": 4.2387498776029195, "learning_rate": 3.9876212586778065e-05, "loss": 2.213, "step": 703 }, { "epoch": 0.054809984915575885, "grad_norm": 4.7229611471639235, "learning_rate": 3.987565744050935e-05, "loss": 2.281, "step": 704 }, { "epoch": 0.05488784000778551, "grad_norm": 4.677030136110006, "learning_rate": 3.987510105608059e-05, "loss": 2.1677, "step": 705 }, { "epoch": 0.05496569509999513, "grad_norm": 5.7010211974022145, "learning_rate": 3.987454343352646e-05, "loss": 2.25, "step": 706 }, { "epoch": 0.055043550192204756, "grad_norm": 3.7174250626281995, "learning_rate": 3.9873984572881695e-05, "loss": 2.1467, "step": 707 }, { "epoch": 0.055121405284414386, "grad_norm": 3.8618588480224907, "learning_rate": 3.987342447418111e-05, "loss": 2.0797, "step": 708 }, { "epoch": 0.05519926037662401, "grad_norm": 3.982445806967193, "learning_rate": 3.987286313745959e-05, "loss": 2.1109, "step": 709 }, { "epoch": 0.05527711546883363, "grad_norm": 5.597861728463657, "learning_rate": 3.987230056275211e-05, "loss": 2.2056, "step": 710 }, { "epoch": 0.05535497056104326, "grad_norm": 4.510380773661309, "learning_rate": 3.987173675009371e-05, "loss": 2.2566, "step": 711 }, { "epoch": 0.05543282565325288, "grad_norm": 3.9177318329810857, "learning_rate": 3.987117169951951e-05, "loss": 2.0642, "step": 712 }, { "epoch": 0.05551068074546251, "grad_norm": 5.373781985555478, "learning_rate": 3.987060541106473e-05, "loss": 2.1466, "step": 713 }, { "epoch": 0.055588535837672134, "grad_norm": 4.357649059661185, "learning_rate": 3.987003788476462e-05, "loss": 2.2461, "step": 714 }, { "epoch": 0.05566639092988176, "grad_norm": 4.554418311912257, "learning_rate": 3.9869469120654555e-05, "loss": 2.2224, "step": 715 }, { "epoch": 0.05574424602209138, "grad_norm": 4.813818115859607, "learning_rate": 3.9868899118769955e-05, "loss": 2.0991, "step": 716 }, { "epoch": 0.055822101114301005, "grad_norm": 4.6857952625531185, "learning_rate": 3.986832787914633e-05, "loss": 2.0749, "step": 717 }, { "epoch": 0.05589995620651063, "grad_norm": 4.139025566020366, "learning_rate": 3.986775540181927e-05, "loss": 2.0427, "step": 718 }, { "epoch": 0.05597781129872026, "grad_norm": 4.055059685956901, "learning_rate": 3.986718168682444e-05, "loss": 2.0717, "step": 719 }, { "epoch": 0.05605566639092988, "grad_norm": 4.3909938670658635, "learning_rate": 3.9866606734197566e-05, "loss": 2.0741, "step": 720 }, { "epoch": 0.056133521483139506, "grad_norm": 3.8293534554564053, "learning_rate": 3.986603054397448e-05, "loss": 2.1325, "step": 721 }, { "epoch": 0.05621137657534913, "grad_norm": 4.869243708436137, "learning_rate": 3.986545311619107e-05, "loss": 2.0096, "step": 722 }, { "epoch": 0.05628923166755875, "grad_norm": 4.153576933022916, "learning_rate": 3.98648744508833e-05, "loss": 2.0338, "step": 723 }, { "epoch": 0.056367086759768384, "grad_norm": 4.0511460882816905, "learning_rate": 3.986429454808723e-05, "loss": 2.1935, "step": 724 }, { "epoch": 0.05644494185197801, "grad_norm": 4.821629336904303, "learning_rate": 3.986371340783898e-05, "loss": 2.1713, "step": 725 }, { "epoch": 0.05652279694418763, "grad_norm": 3.6763381183719477, "learning_rate": 3.9863131030174755e-05, "loss": 2.0339, "step": 726 }, { "epoch": 0.056600652036397255, "grad_norm": 3.5772664502411895, "learning_rate": 3.986254741513083e-05, "loss": 2.0118, "step": 727 }, { "epoch": 0.05667850712860688, "grad_norm": 4.295889423492009, "learning_rate": 3.9861962562743565e-05, "loss": 2.0271, "step": 728 }, { "epoch": 0.05675636222081651, "grad_norm": 4.5938578148384135, "learning_rate": 3.986137647304938e-05, "loss": 2.1975, "step": 729 }, { "epoch": 0.05683421731302613, "grad_norm": 3.803735634654407, "learning_rate": 3.986078914608481e-05, "loss": 2.0273, "step": 730 }, { "epoch": 0.056912072405235756, "grad_norm": 3.7111344222149447, "learning_rate": 3.986020058188643e-05, "loss": 1.9562, "step": 731 }, { "epoch": 0.05698992749744538, "grad_norm": 3.955792481425794, "learning_rate": 3.98596107804909e-05, "loss": 2.1007, "step": 732 }, { "epoch": 0.057067782589655, "grad_norm": 4.132763056511665, "learning_rate": 3.985901974193497e-05, "loss": 2.009, "step": 733 }, { "epoch": 0.057145637681864626, "grad_norm": 4.463050502565755, "learning_rate": 3.985842746625545e-05, "loss": 2.1593, "step": 734 }, { "epoch": 0.05722349277407426, "grad_norm": 4.649524744513995, "learning_rate": 3.985783395348925e-05, "loss": 2.1589, "step": 735 }, { "epoch": 0.05730134786628388, "grad_norm": 3.677952683099778, "learning_rate": 3.985723920367332e-05, "loss": 2.0439, "step": 736 }, { "epoch": 0.057379202958493504, "grad_norm": 3.6424175089426902, "learning_rate": 3.985664321684474e-05, "loss": 2.0804, "step": 737 }, { "epoch": 0.05745705805070313, "grad_norm": 4.1383605720374055, "learning_rate": 3.985604599304062e-05, "loss": 1.9604, "step": 738 }, { "epoch": 0.05753491314291275, "grad_norm": 3.937353528153364, "learning_rate": 3.9855447532298164e-05, "loss": 2.0808, "step": 739 }, { "epoch": 0.05761276823512238, "grad_norm": 3.498221511138684, "learning_rate": 3.985484783465465e-05, "loss": 2.0279, "step": 740 }, { "epoch": 0.057690623327332005, "grad_norm": 4.398976921341441, "learning_rate": 3.985424690014745e-05, "loss": 2.1662, "step": 741 }, { "epoch": 0.05776847841954163, "grad_norm": 3.892204298700843, "learning_rate": 3.985364472881399e-05, "loss": 1.9333, "step": 742 }, { "epoch": 0.05784633351175125, "grad_norm": 3.584769843369109, "learning_rate": 3.985304132069178e-05, "loss": 1.9494, "step": 743 }, { "epoch": 0.057924188603960876, "grad_norm": 4.4675830369065865, "learning_rate": 3.985243667581842e-05, "loss": 2.1109, "step": 744 }, { "epoch": 0.0580020436961705, "grad_norm": 4.726561118357985, "learning_rate": 3.985183079423157e-05, "loss": 2.1413, "step": 745 }, { "epoch": 0.05807989878838013, "grad_norm": 3.8229802004409996, "learning_rate": 3.985122367596896e-05, "loss": 2.0728, "step": 746 }, { "epoch": 0.05815775388058975, "grad_norm": 4.434759558802821, "learning_rate": 3.985061532106843e-05, "loss": 2.0844, "step": 747 }, { "epoch": 0.05823560897279938, "grad_norm": 4.2535329639324235, "learning_rate": 3.9850005729567875e-05, "loss": 2.0826, "step": 748 }, { "epoch": 0.058313464065009, "grad_norm": 3.7806010890167863, "learning_rate": 3.984939490150527e-05, "loss": 2.094, "step": 749 }, { "epoch": 0.058391319157218624, "grad_norm": 3.6481977140472326, "learning_rate": 3.9848782836918664e-05, "loss": 2.0183, "step": 750 }, { "epoch": 0.058391319157218624, "eval_loss": 0.26093655824661255, "eval_runtime": 162.2804, "eval_samples_per_second": 17.747, "eval_steps_per_second": 0.635, "step": 750 }, { "epoch": 0.058469174249428255, "grad_norm": 4.401392040814034, "learning_rate": 3.984816953584617e-05, "loss": 2.0901, "step": 751 }, { "epoch": 0.05854702934163788, "grad_norm": 4.300899796961501, "learning_rate": 3.9847554998326024e-05, "loss": 2.1265, "step": 752 }, { "epoch": 0.0586248844338475, "grad_norm": 3.913048428574186, "learning_rate": 3.984693922439648e-05, "loss": 2.0483, "step": 753 }, { "epoch": 0.058702739526057125, "grad_norm": 4.347778834542965, "learning_rate": 3.9846322214095924e-05, "loss": 2.1381, "step": 754 }, { "epoch": 0.05878059461826675, "grad_norm": 3.8344478126492274, "learning_rate": 3.984570396746277e-05, "loss": 1.9346, "step": 755 }, { "epoch": 0.05885844971047638, "grad_norm": 4.251549018162263, "learning_rate": 3.984508448453555e-05, "loss": 2.0914, "step": 756 }, { "epoch": 0.058936304802686, "grad_norm": 4.343968632633781, "learning_rate": 3.984446376535284e-05, "loss": 2.0436, "step": 757 }, { "epoch": 0.059014159894895626, "grad_norm": 4.2106260413577585, "learning_rate": 3.9843841809953326e-05, "loss": 2.0625, "step": 758 }, { "epoch": 0.05909201498710525, "grad_norm": 4.062178767315895, "learning_rate": 3.9843218618375726e-05, "loss": 2.0395, "step": 759 }, { "epoch": 0.059169870079314874, "grad_norm": 3.5655369013034934, "learning_rate": 3.984259419065889e-05, "loss": 2.0287, "step": 760 }, { "epoch": 0.0592477251715245, "grad_norm": 4.4437718931601635, "learning_rate": 3.9841968526841694e-05, "loss": 2.0284, "step": 761 }, { "epoch": 0.05932558026373413, "grad_norm": 4.066887040467909, "learning_rate": 3.984134162696313e-05, "loss": 2.0578, "step": 762 }, { "epoch": 0.05940343535594375, "grad_norm": 3.9009529735987676, "learning_rate": 3.9840713491062244e-05, "loss": 2.0491, "step": 763 }, { "epoch": 0.059481290448153375, "grad_norm": 4.305845766181595, "learning_rate": 3.984008411917817e-05, "loss": 2.1175, "step": 764 }, { "epoch": 0.059559145540363, "grad_norm": 4.467115763240737, "learning_rate": 3.9839453511350106e-05, "loss": 2.0657, "step": 765 }, { "epoch": 0.05963700063257262, "grad_norm": 3.7802138601188666, "learning_rate": 3.983882166761735e-05, "loss": 2.0991, "step": 766 }, { "epoch": 0.05971485572478225, "grad_norm": 3.944077262083219, "learning_rate": 3.983818858801925e-05, "loss": 2.0529, "step": 767 }, { "epoch": 0.059792710816991876, "grad_norm": 3.846556355791988, "learning_rate": 3.983755427259525e-05, "loss": 1.9452, "step": 768 }, { "epoch": 0.0598705659092015, "grad_norm": 3.9096763910057803, "learning_rate": 3.983691872138485e-05, "loss": 2.0022, "step": 769 }, { "epoch": 0.05994842100141112, "grad_norm": 3.9556580995549018, "learning_rate": 3.983628193442768e-05, "loss": 2.0625, "step": 770 }, { "epoch": 0.060026276093620747, "grad_norm": 4.245119783304341, "learning_rate": 3.983564391176337e-05, "loss": 1.9741, "step": 771 }, { "epoch": 0.06010413118583037, "grad_norm": 3.9391028254568243, "learning_rate": 3.983500465343168e-05, "loss": 1.9474, "step": 772 }, { "epoch": 0.06018198627804, "grad_norm": 3.6134423946651633, "learning_rate": 3.983436415947243e-05, "loss": 1.9778, "step": 773 }, { "epoch": 0.060259841370249624, "grad_norm": 3.9659193531803716, "learning_rate": 3.9833722429925536e-05, "loss": 1.9394, "step": 774 }, { "epoch": 0.06033769646245925, "grad_norm": 4.20695072210683, "learning_rate": 3.983307946483095e-05, "loss": 2.0529, "step": 775 }, { "epoch": 0.06041555155466887, "grad_norm": 3.668470139401913, "learning_rate": 3.983243526422874e-05, "loss": 2.008, "step": 776 }, { "epoch": 0.060493406646878495, "grad_norm": 3.537924178795959, "learning_rate": 3.983178982815903e-05, "loss": 1.9763, "step": 777 }, { "epoch": 0.060571261739088125, "grad_norm": 3.9712753662087734, "learning_rate": 3.983114315666204e-05, "loss": 2.1268, "step": 778 }, { "epoch": 0.06064911683129775, "grad_norm": 3.6289539472466537, "learning_rate": 3.983049524977804e-05, "loss": 1.9694, "step": 779 }, { "epoch": 0.06072697192350737, "grad_norm": 4.106253175260417, "learning_rate": 3.98298461075474e-05, "loss": 1.9556, "step": 780 }, { "epoch": 0.060804827015716996, "grad_norm": 3.698494941139467, "learning_rate": 3.9829195730010556e-05, "loss": 2.0237, "step": 781 }, { "epoch": 0.06088268210792662, "grad_norm": 4.023262637590564, "learning_rate": 3.982854411720802e-05, "loss": 2.0546, "step": 782 }, { "epoch": 0.06096053720013624, "grad_norm": 4.346824859828715, "learning_rate": 3.98278912691804e-05, "loss": 2.0799, "step": 783 }, { "epoch": 0.061038392292345874, "grad_norm": 3.587032957251621, "learning_rate": 3.982723718596833e-05, "loss": 1.9581, "step": 784 }, { "epoch": 0.0611162473845555, "grad_norm": 3.486568451053632, "learning_rate": 3.98265818676126e-05, "loss": 2.0855, "step": 785 }, { "epoch": 0.06119410247676512, "grad_norm": 4.357065053377913, "learning_rate": 3.9825925314154006e-05, "loss": 2.2331, "step": 786 }, { "epoch": 0.061271957568974744, "grad_norm": 4.773433116503465, "learning_rate": 3.9825267525633454e-05, "loss": 2.0432, "step": 787 }, { "epoch": 0.06134981266118437, "grad_norm": 3.163121547397547, "learning_rate": 3.982460850209192e-05, "loss": 2.0322, "step": 788 }, { "epoch": 0.061427667753394, "grad_norm": 3.363353077894021, "learning_rate": 3.9823948243570465e-05, "loss": 1.9807, "step": 789 }, { "epoch": 0.06150552284560362, "grad_norm": 4.007970214095152, "learning_rate": 3.982328675011022e-05, "loss": 1.9997, "step": 790 }, { "epoch": 0.061583377937813245, "grad_norm": 3.8371703549907537, "learning_rate": 3.9822624021752375e-05, "loss": 1.9568, "step": 791 }, { "epoch": 0.06166123303002287, "grad_norm": 3.9735477385321647, "learning_rate": 3.982196005853823e-05, "loss": 1.929, "step": 792 }, { "epoch": 0.06173908812223249, "grad_norm": 3.887139149597649, "learning_rate": 3.982129486050915e-05, "loss": 1.915, "step": 793 }, { "epoch": 0.06181694321444212, "grad_norm": 4.1702339399532615, "learning_rate": 3.982062842770657e-05, "loss": 2.054, "step": 794 }, { "epoch": 0.06189479830665175, "grad_norm": 4.004192763858405, "learning_rate": 3.9819960760172e-05, "loss": 2.0176, "step": 795 }, { "epoch": 0.06197265339886137, "grad_norm": 3.5760531568883014, "learning_rate": 3.981929185794704e-05, "loss": 1.9916, "step": 796 }, { "epoch": 0.062050508491070994, "grad_norm": 4.108869954960854, "learning_rate": 3.981862172107336e-05, "loss": 1.9592, "step": 797 }, { "epoch": 0.06212836358328062, "grad_norm": 3.9646583546001155, "learning_rate": 3.981795034959269e-05, "loss": 1.9241, "step": 798 }, { "epoch": 0.06220621867549024, "grad_norm": 3.479864366868761, "learning_rate": 3.981727774354688e-05, "loss": 2.0259, "step": 799 }, { "epoch": 0.06228407376769987, "grad_norm": 3.7843071385484945, "learning_rate": 3.98166039029778e-05, "loss": 1.9353, "step": 800 }, { "epoch": 0.06228407376769987, "eval_loss": 0.25322023034095764, "eval_runtime": 162.5686, "eval_samples_per_second": 17.716, "eval_steps_per_second": 0.634, "step": 800 }, { "epoch": 0.062361928859909495, "grad_norm": 4.699899848030563, "learning_rate": 3.981592882792746e-05, "loss": 1.9361, "step": 801 }, { "epoch": 0.06243978395211912, "grad_norm": 3.896177307142385, "learning_rate": 3.9815252518437886e-05, "loss": 2.0, "step": 802 }, { "epoch": 0.06251763904432875, "grad_norm": 3.3845605008707658, "learning_rate": 3.981457497455122e-05, "loss": 1.9992, "step": 803 }, { "epoch": 0.06259549413653837, "grad_norm": 3.802109242670562, "learning_rate": 3.9813896196309675e-05, "loss": 1.8782, "step": 804 }, { "epoch": 0.062673349228748, "grad_norm": 3.5225429870069425, "learning_rate": 3.981321618375553e-05, "loss": 1.8991, "step": 805 }, { "epoch": 0.06275120432095761, "grad_norm": 3.6233208911480403, "learning_rate": 3.981253493693114e-05, "loss": 1.9881, "step": 806 }, { "epoch": 0.06282905941316724, "grad_norm": 3.3731558948824407, "learning_rate": 3.981185245587896e-05, "loss": 1.8478, "step": 807 }, { "epoch": 0.06290691450537687, "grad_norm": 3.5490865289798723, "learning_rate": 3.98111687406415e-05, "loss": 2.0315, "step": 808 }, { "epoch": 0.06298476959758649, "grad_norm": 3.976413715584318, "learning_rate": 3.981048379126133e-05, "loss": 1.9976, "step": 809 }, { "epoch": 0.06306262468979612, "grad_norm": 4.081386707030724, "learning_rate": 3.980979760778115e-05, "loss": 1.9978, "step": 810 }, { "epoch": 0.06314047978200574, "grad_norm": 3.7230036516309943, "learning_rate": 3.980911019024369e-05, "loss": 1.9991, "step": 811 }, { "epoch": 0.06321833487421537, "grad_norm": 4.237649154577667, "learning_rate": 3.980842153869178e-05, "loss": 1.9058, "step": 812 }, { "epoch": 0.063296189966425, "grad_norm": 4.076451660689898, "learning_rate": 3.980773165316831e-05, "loss": 1.9624, "step": 813 }, { "epoch": 0.06337404505863461, "grad_norm": 4.173265868330753, "learning_rate": 3.980704053371627e-05, "loss": 2.0611, "step": 814 }, { "epoch": 0.06345190015084425, "grad_norm": 4.0504847968411335, "learning_rate": 3.98063481803787e-05, "loss": 2.0027, "step": 815 }, { "epoch": 0.06352975524305386, "grad_norm": 3.861938496874526, "learning_rate": 3.980565459319873e-05, "loss": 1.9682, "step": 816 }, { "epoch": 0.06360761033526349, "grad_norm": 4.1297011857615, "learning_rate": 3.980495977221959e-05, "loss": 2.0064, "step": 817 }, { "epoch": 0.06368546542747311, "grad_norm": 3.8536000112188065, "learning_rate": 3.9804263717484537e-05, "loss": 1.9754, "step": 818 }, { "epoch": 0.06376332051968274, "grad_norm": 3.3866477318475043, "learning_rate": 3.9803566429036946e-05, "loss": 2.0573, "step": 819 }, { "epoch": 0.06384117561189237, "grad_norm": 4.036691251025716, "learning_rate": 3.980286790692025e-05, "loss": 1.8903, "step": 820 }, { "epoch": 0.06391903070410199, "grad_norm": 4.263233384592717, "learning_rate": 3.980216815117797e-05, "loss": 1.9584, "step": 821 }, { "epoch": 0.06399688579631162, "grad_norm": 4.148702081502497, "learning_rate": 3.980146716185369e-05, "loss": 2.0347, "step": 822 }, { "epoch": 0.06407474088852123, "grad_norm": 3.7813666203304948, "learning_rate": 3.9800764938991074e-05, "loss": 2.0152, "step": 823 }, { "epoch": 0.06415259598073086, "grad_norm": 4.344446455622084, "learning_rate": 3.980006148263388e-05, "loss": 2.0106, "step": 824 }, { "epoch": 0.0642304510729405, "grad_norm": 4.0679660356155525, "learning_rate": 3.9799356792825925e-05, "loss": 2.0385, "step": 825 }, { "epoch": 0.06430830616515011, "grad_norm": 3.4966874667733334, "learning_rate": 3.9798650869611104e-05, "loss": 1.8894, "step": 826 }, { "epoch": 0.06438616125735974, "grad_norm": 3.7116624730571637, "learning_rate": 3.97979437130334e-05, "loss": 1.9634, "step": 827 }, { "epoch": 0.06446401634956936, "grad_norm": 4.271148877014679, "learning_rate": 3.979723532313685e-05, "loss": 1.8979, "step": 828 }, { "epoch": 0.06454187144177899, "grad_norm": 4.10736344321954, "learning_rate": 3.9796525699965604e-05, "loss": 1.9688, "step": 829 }, { "epoch": 0.06461972653398862, "grad_norm": 3.929900409889936, "learning_rate": 3.979581484356386e-05, "loss": 1.8903, "step": 830 }, { "epoch": 0.06469758162619824, "grad_norm": 4.25658068211986, "learning_rate": 3.9795102753975895e-05, "loss": 2.0345, "step": 831 }, { "epoch": 0.06477543671840787, "grad_norm": 3.7458554269398423, "learning_rate": 3.979438943124608e-05, "loss": 1.935, "step": 832 }, { "epoch": 0.06485329181061748, "grad_norm": 4.170879657072395, "learning_rate": 3.9793674875418845e-05, "loss": 1.9509, "step": 833 }, { "epoch": 0.06493114690282711, "grad_norm": 3.383240761307561, "learning_rate": 3.979295908653869e-05, "loss": 1.9166, "step": 834 }, { "epoch": 0.06500900199503674, "grad_norm": 3.5846803175314874, "learning_rate": 3.979224206465023e-05, "loss": 1.9255, "step": 835 }, { "epoch": 0.06508685708724636, "grad_norm": 4.064127768309124, "learning_rate": 3.979152380979813e-05, "loss": 1.9737, "step": 836 }, { "epoch": 0.06516471217945599, "grad_norm": 3.1583060979995903, "learning_rate": 3.979080432202711e-05, "loss": 1.8749, "step": 837 }, { "epoch": 0.06524256727166561, "grad_norm": 3.7656938041494774, "learning_rate": 3.979008360138201e-05, "loss": 1.8465, "step": 838 }, { "epoch": 0.06532042236387524, "grad_norm": 4.076279774204517, "learning_rate": 3.978936164790773e-05, "loss": 1.9367, "step": 839 }, { "epoch": 0.06539827745608487, "grad_norm": 3.279037362300912, "learning_rate": 3.978863846164923e-05, "loss": 1.7509, "step": 840 }, { "epoch": 0.06547613254829449, "grad_norm": 3.8246348186300003, "learning_rate": 3.978791404265157e-05, "loss": 1.9826, "step": 841 }, { "epoch": 0.06555398764050412, "grad_norm": 4.573370437508255, "learning_rate": 3.978718839095987e-05, "loss": 1.9434, "step": 842 }, { "epoch": 0.06563184273271373, "grad_norm": 4.53824298879184, "learning_rate": 3.978646150661935e-05, "loss": 1.9114, "step": 843 }, { "epoch": 0.06570969782492336, "grad_norm": 2.98493185410714, "learning_rate": 3.978573338967529e-05, "loss": 1.7392, "step": 844 }, { "epoch": 0.06578755291713298, "grad_norm": 4.264922777589176, "learning_rate": 3.978500404017303e-05, "loss": 1.8966, "step": 845 }, { "epoch": 0.06586540800934261, "grad_norm": 3.6631193135665683, "learning_rate": 3.978427345815802e-05, "loss": 1.894, "step": 846 }, { "epoch": 0.06594326310155224, "grad_norm": 3.517764509238351, "learning_rate": 3.978354164367576e-05, "loss": 1.8153, "step": 847 }, { "epoch": 0.06602111819376186, "grad_norm": 4.18615864889936, "learning_rate": 3.9782808596771855e-05, "loss": 1.922, "step": 848 }, { "epoch": 0.06609897328597149, "grad_norm": 3.777130243683604, "learning_rate": 3.978207431749196e-05, "loss": 1.9682, "step": 849 }, { "epoch": 0.0661768283781811, "grad_norm": 3.610962538289517, "learning_rate": 3.978133880588182e-05, "loss": 1.9219, "step": 850 }, { "epoch": 0.0661768283781811, "eval_loss": 0.23948536813259125, "eval_runtime": 162.8393, "eval_samples_per_second": 17.686, "eval_steps_per_second": 0.633, "step": 850 }, { "epoch": 0.06625468347039074, "grad_norm": 3.9845922842394494, "learning_rate": 3.9780602061987256e-05, "loss": 1.9404, "step": 851 }, { "epoch": 0.06633253856260037, "grad_norm": 3.4212825294181615, "learning_rate": 3.977986408585415e-05, "loss": 1.8736, "step": 852 }, { "epoch": 0.06641039365480998, "grad_norm": 3.5397293235100387, "learning_rate": 3.977912487752849e-05, "loss": 1.8881, "step": 853 }, { "epoch": 0.06648824874701961, "grad_norm": 3.510015778458239, "learning_rate": 3.977838443705632e-05, "loss": 1.8513, "step": 854 }, { "epoch": 0.06656610383922923, "grad_norm": 3.3211217528761954, "learning_rate": 3.977764276448377e-05, "loss": 1.8814, "step": 855 }, { "epoch": 0.06664395893143886, "grad_norm": 3.461106767833892, "learning_rate": 3.977689985985703e-05, "loss": 1.8489, "step": 856 }, { "epoch": 0.06672181402364849, "grad_norm": 3.6770543659756, "learning_rate": 3.9776155723222396e-05, "loss": 1.916, "step": 857 }, { "epoch": 0.0667996691158581, "grad_norm": 3.461130571945196, "learning_rate": 3.977541035462622e-05, "loss": 1.9199, "step": 858 }, { "epoch": 0.06687752420806774, "grad_norm": 3.5418086570162854, "learning_rate": 3.977466375411492e-05, "loss": 1.894, "step": 859 }, { "epoch": 0.06695537930027735, "grad_norm": 3.359765542804778, "learning_rate": 3.977391592173502e-05, "loss": 1.8407, "step": 860 }, { "epoch": 0.06703323439248698, "grad_norm": 3.6833690703259068, "learning_rate": 3.9773166857533114e-05, "loss": 1.856, "step": 861 }, { "epoch": 0.06711108948469661, "grad_norm": 3.773538205705743, "learning_rate": 3.977241656155584e-05, "loss": 1.8316, "step": 862 }, { "epoch": 0.06718894457690623, "grad_norm": 3.4535913795340094, "learning_rate": 3.977166503384997e-05, "loss": 1.8874, "step": 863 }, { "epoch": 0.06726679966911586, "grad_norm": 3.455807893495306, "learning_rate": 3.97709122744623e-05, "loss": 1.8387, "step": 864 }, { "epoch": 0.06734465476132548, "grad_norm": 3.35385947278237, "learning_rate": 3.9770158283439714e-05, "loss": 1.933, "step": 865 }, { "epoch": 0.06742250985353511, "grad_norm": 3.3528992875373342, "learning_rate": 3.9769403060829204e-05, "loss": 1.927, "step": 866 }, { "epoch": 0.06750036494574473, "grad_norm": 3.2888360754326285, "learning_rate": 3.976864660667781e-05, "loss": 1.8599, "step": 867 }, { "epoch": 0.06757822003795436, "grad_norm": 3.37313576360631, "learning_rate": 3.976788892103264e-05, "loss": 1.8322, "step": 868 }, { "epoch": 0.06765607513016399, "grad_norm": 3.319610314350144, "learning_rate": 3.976713000394092e-05, "loss": 1.7738, "step": 869 }, { "epoch": 0.0677339302223736, "grad_norm": 3.446275060609828, "learning_rate": 3.9766369855449915e-05, "loss": 1.8685, "step": 870 }, { "epoch": 0.06781178531458323, "grad_norm": 3.4878066687227403, "learning_rate": 3.976560847560697e-05, "loss": 1.7842, "step": 871 }, { "epoch": 0.06788964040679285, "grad_norm": 3.921305343569305, "learning_rate": 3.976484586445953e-05, "loss": 1.9673, "step": 872 }, { "epoch": 0.06796749549900248, "grad_norm": 3.2940177336829595, "learning_rate": 3.9764082022055096e-05, "loss": 1.8309, "step": 873 }, { "epoch": 0.06804535059121211, "grad_norm": 3.7099559029437525, "learning_rate": 3.9763316948441246e-05, "loss": 1.9131, "step": 874 }, { "epoch": 0.06812320568342173, "grad_norm": 3.2946556523706945, "learning_rate": 3.976255064366565e-05, "loss": 1.8126, "step": 875 }, { "epoch": 0.06820106077563136, "grad_norm": 4.433915069228916, "learning_rate": 3.976178310777604e-05, "loss": 1.9302, "step": 876 }, { "epoch": 0.06827891586784098, "grad_norm": 3.634145419437029, "learning_rate": 3.976101434082023e-05, "loss": 1.9039, "step": 877 }, { "epoch": 0.0683567709600506, "grad_norm": 4.286951062939625, "learning_rate": 3.976024434284611e-05, "loss": 1.9056, "step": 878 }, { "epoch": 0.06843462605226024, "grad_norm": 3.2793256252773864, "learning_rate": 3.9759473113901656e-05, "loss": 1.8126, "step": 879 }, { "epoch": 0.06851248114446985, "grad_norm": 4.315040789302918, "learning_rate": 3.9758700654034894e-05, "loss": 2.0228, "step": 880 }, { "epoch": 0.06859033623667948, "grad_norm": 4.355536165075698, "learning_rate": 3.9757926963293964e-05, "loss": 1.8853, "step": 881 }, { "epoch": 0.0686681913288891, "grad_norm": 3.743910505163951, "learning_rate": 3.9757152041727054e-05, "loss": 1.9703, "step": 882 }, { "epoch": 0.06874604642109873, "grad_norm": 3.767296369430651, "learning_rate": 3.9756375889382436e-05, "loss": 1.9453, "step": 883 }, { "epoch": 0.06882390151330836, "grad_norm": 3.3044763096810206, "learning_rate": 3.9755598506308466e-05, "loss": 1.7623, "step": 884 }, { "epoch": 0.06890175660551798, "grad_norm": 3.584050876247748, "learning_rate": 3.975481989255356e-05, "loss": 1.8936, "step": 885 }, { "epoch": 0.06897961169772761, "grad_norm": 3.030105860789206, "learning_rate": 3.975404004816624e-05, "loss": 1.7612, "step": 886 }, { "epoch": 0.06905746678993722, "grad_norm": 3.6682377982369245, "learning_rate": 3.975325897319508e-05, "loss": 1.8463, "step": 887 }, { "epoch": 0.06913532188214686, "grad_norm": 3.456569888037046, "learning_rate": 3.975247666768873e-05, "loss": 1.9316, "step": 888 }, { "epoch": 0.06921317697435649, "grad_norm": 3.581726170736334, "learning_rate": 3.975169313169592e-05, "loss": 1.8978, "step": 889 }, { "epoch": 0.0692910320665661, "grad_norm": 4.027389653678286, "learning_rate": 3.975090836526548e-05, "loss": 1.9061, "step": 890 }, { "epoch": 0.06936888715877573, "grad_norm": 3.294364398320807, "learning_rate": 3.9750122368446285e-05, "loss": 1.8209, "step": 891 }, { "epoch": 0.06944674225098535, "grad_norm": 3.7298361248053356, "learning_rate": 3.9749335141287296e-05, "loss": 1.8211, "step": 892 }, { "epoch": 0.06952459734319498, "grad_norm": 3.3072025428644944, "learning_rate": 3.9748546683837555e-05, "loss": 1.7866, "step": 893 }, { "epoch": 0.0696024524354046, "grad_norm": 3.8534342160167543, "learning_rate": 3.974775699614619e-05, "loss": 1.814, "step": 894 }, { "epoch": 0.06968030752761423, "grad_norm": 3.363267006491431, "learning_rate": 3.9746966078262386e-05, "loss": 1.8319, "step": 895 }, { "epoch": 0.06975816261982386, "grad_norm": 3.5941224291498597, "learning_rate": 3.97461739302354e-05, "loss": 1.8027, "step": 896 }, { "epoch": 0.06983601771203347, "grad_norm": 3.361997368918986, "learning_rate": 3.9745380552114614e-05, "loss": 1.8338, "step": 897 }, { "epoch": 0.0699138728042431, "grad_norm": 3.44778579058675, "learning_rate": 3.974458594394942e-05, "loss": 1.7771, "step": 898 }, { "epoch": 0.06999172789645272, "grad_norm": 3.208923269764381, "learning_rate": 3.974379010578933e-05, "loss": 1.8359, "step": 899 }, { "epoch": 0.07006958298866235, "grad_norm": 3.4204932857942874, "learning_rate": 3.974299303768392e-05, "loss": 1.8218, "step": 900 }, { "epoch": 0.07006958298866235, "eval_loss": 0.23199251294136047, "eval_runtime": 162.1445, "eval_samples_per_second": 17.762, "eval_steps_per_second": 0.635, "step": 900 }, { "epoch": 0.07014743808087198, "grad_norm": 3.8732009114647226, "learning_rate": 3.974219473968285e-05, "loss": 1.8481, "step": 901 }, { "epoch": 0.0702252931730816, "grad_norm": 3.4329084959742833, "learning_rate": 3.974139521183584e-05, "loss": 1.9664, "step": 902 }, { "epoch": 0.07030314826529123, "grad_norm": 3.825239994271074, "learning_rate": 3.97405944541927e-05, "loss": 1.8204, "step": 903 }, { "epoch": 0.07038100335750085, "grad_norm": 3.992352049094025, "learning_rate": 3.973979246680331e-05, "loss": 1.8919, "step": 904 }, { "epoch": 0.07045885844971048, "grad_norm": 4.12211374074485, "learning_rate": 3.973898924971764e-05, "loss": 1.7921, "step": 905 }, { "epoch": 0.0705367135419201, "grad_norm": 3.602902643635529, "learning_rate": 3.9738184802985724e-05, "loss": 1.916, "step": 906 }, { "epoch": 0.07061456863412972, "grad_norm": 4.196152176040298, "learning_rate": 3.973737912665767e-05, "loss": 1.8435, "step": 907 }, { "epoch": 0.07069242372633935, "grad_norm": 3.6541466923651336, "learning_rate": 3.9736572220783676e-05, "loss": 1.9092, "step": 908 }, { "epoch": 0.07077027881854897, "grad_norm": 3.6100941212163358, "learning_rate": 3.9735764085414e-05, "loss": 1.7605, "step": 909 }, { "epoch": 0.0708481339107586, "grad_norm": 4.240552831013888, "learning_rate": 3.973495472059898e-05, "loss": 1.8598, "step": 910 }, { "epoch": 0.07092598900296823, "grad_norm": 3.461023940460826, "learning_rate": 3.973414412638905e-05, "loss": 1.7563, "step": 911 }, { "epoch": 0.07100384409517785, "grad_norm": 3.3660221895404097, "learning_rate": 3.973333230283469e-05, "loss": 1.8694, "step": 912 }, { "epoch": 0.07108169918738748, "grad_norm": 3.7015741150169412, "learning_rate": 3.973251924998649e-05, "loss": 1.8112, "step": 913 }, { "epoch": 0.0711595542795971, "grad_norm": 3.56912627536998, "learning_rate": 3.97317049678951e-05, "loss": 1.7041, "step": 914 }, { "epoch": 0.07123740937180673, "grad_norm": 3.1685918027732725, "learning_rate": 3.973088945661122e-05, "loss": 1.8115, "step": 915 }, { "epoch": 0.07131526446401636, "grad_norm": 3.6889209400692455, "learning_rate": 3.973007271618568e-05, "loss": 1.8571, "step": 916 }, { "epoch": 0.07139311955622597, "grad_norm": 3.68751776950163, "learning_rate": 3.9729254746669355e-05, "loss": 1.8562, "step": 917 }, { "epoch": 0.0714709746484356, "grad_norm": 3.4027034778578558, "learning_rate": 3.9728435548113184e-05, "loss": 1.7402, "step": 918 }, { "epoch": 0.07154882974064522, "grad_norm": 3.354079620240788, "learning_rate": 3.972761512056822e-05, "loss": 1.829, "step": 919 }, { "epoch": 0.07162668483285485, "grad_norm": 3.5598988846319717, "learning_rate": 3.972679346408556e-05, "loss": 1.8241, "step": 920 }, { "epoch": 0.07170453992506447, "grad_norm": 3.5494118747615095, "learning_rate": 3.9725970578716386e-05, "loss": 1.9014, "step": 921 }, { "epoch": 0.0717823950172741, "grad_norm": 3.093842456493267, "learning_rate": 3.972514646451196e-05, "loss": 1.8013, "step": 922 }, { "epoch": 0.07186025010948373, "grad_norm": 3.26809512942528, "learning_rate": 3.972432112152364e-05, "loss": 1.7513, "step": 923 }, { "epoch": 0.07193810520169334, "grad_norm": 3.9561570891005324, "learning_rate": 3.9723494549802815e-05, "loss": 1.8218, "step": 924 }, { "epoch": 0.07201596029390298, "grad_norm": 3.4030271766258324, "learning_rate": 3.9722666749400994e-05, "loss": 1.8574, "step": 925 }, { "epoch": 0.07209381538611259, "grad_norm": 3.293751057261914, "learning_rate": 3.972183772036974e-05, "loss": 1.8232, "step": 926 }, { "epoch": 0.07217167047832222, "grad_norm": 3.29730337221043, "learning_rate": 3.9721007462760696e-05, "loss": 1.7909, "step": 927 }, { "epoch": 0.07224952557053185, "grad_norm": 4.476011232270236, "learning_rate": 3.9720175976625584e-05, "loss": 1.885, "step": 928 }, { "epoch": 0.07232738066274147, "grad_norm": 3.4544609288612795, "learning_rate": 3.97193432620162e-05, "loss": 1.7981, "step": 929 }, { "epoch": 0.0724052357549511, "grad_norm": 3.0950864809658847, "learning_rate": 3.971850931898442e-05, "loss": 1.7464, "step": 930 }, { "epoch": 0.07248309084716072, "grad_norm": 3.165790331411462, "learning_rate": 3.971767414758219e-05, "loss": 1.805, "step": 931 }, { "epoch": 0.07256094593937035, "grad_norm": 3.742160398906861, "learning_rate": 3.971683774786155e-05, "loss": 1.7375, "step": 932 }, { "epoch": 0.07263880103157998, "grad_norm": 3.2979528610585516, "learning_rate": 3.9716000119874594e-05, "loss": 1.8485, "step": 933 }, { "epoch": 0.0727166561237896, "grad_norm": 3.375053631232677, "learning_rate": 3.97151612636735e-05, "loss": 1.7556, "step": 934 }, { "epoch": 0.07279451121599922, "grad_norm": 3.814950704707393, "learning_rate": 3.971432117931053e-05, "loss": 1.8503, "step": 935 }, { "epoch": 0.07287236630820884, "grad_norm": 3.1941248452021545, "learning_rate": 3.971347986683802e-05, "loss": 1.7773, "step": 936 }, { "epoch": 0.07295022140041847, "grad_norm": 3.4210940732948956, "learning_rate": 3.9712637326308375e-05, "loss": 1.7831, "step": 937 }, { "epoch": 0.0730280764926281, "grad_norm": 3.658857424917701, "learning_rate": 3.971179355777408e-05, "loss": 1.7899, "step": 938 }, { "epoch": 0.07310593158483772, "grad_norm": 3.3257256671284474, "learning_rate": 3.97109485612877e-05, "loss": 1.82, "step": 939 }, { "epoch": 0.07318378667704735, "grad_norm": 3.6675583839247565, "learning_rate": 3.971010233690187e-05, "loss": 1.8218, "step": 940 }, { "epoch": 0.07326164176925697, "grad_norm": 3.480154574737109, "learning_rate": 3.970925488466932e-05, "loss": 1.7492, "step": 941 }, { "epoch": 0.0733394968614666, "grad_norm": 4.514362747485951, "learning_rate": 3.970840620464283e-05, "loss": 1.8235, "step": 942 }, { "epoch": 0.07341735195367623, "grad_norm": 3.4967588646164818, "learning_rate": 3.970755629687526e-05, "loss": 1.8765, "step": 943 }, { "epoch": 0.07349520704588584, "grad_norm": 3.1369698525838317, "learning_rate": 3.970670516141958e-05, "loss": 1.7298, "step": 944 }, { "epoch": 0.07357306213809547, "grad_norm": 3.9474235015286108, "learning_rate": 3.9705852798328794e-05, "loss": 1.7333, "step": 945 }, { "epoch": 0.07365091723030509, "grad_norm": 3.6090127672736996, "learning_rate": 3.9704999207656006e-05, "loss": 1.8661, "step": 946 }, { "epoch": 0.07372877232251472, "grad_norm": 3.2451296157980964, "learning_rate": 3.970414438945438e-05, "loss": 1.7725, "step": 947 }, { "epoch": 0.07380662741472434, "grad_norm": 4.107264053473356, "learning_rate": 3.970328834377718e-05, "loss": 1.711, "step": 948 }, { "epoch": 0.07388448250693397, "grad_norm": 2.9236710303031055, "learning_rate": 3.9702431070677736e-05, "loss": 1.6558, "step": 949 }, { "epoch": 0.0739623375991436, "grad_norm": 3.417223890630896, "learning_rate": 3.970157257020944e-05, "loss": 1.8609, "step": 950 }, { "epoch": 0.0739623375991436, "eval_loss": 0.22510036826133728, "eval_runtime": 162.4028, "eval_samples_per_second": 17.734, "eval_steps_per_second": 0.634, "step": 950 }, { "epoch": 0.07404019269135322, "grad_norm": 4.200255228666137, "learning_rate": 3.970071284242578e-05, "loss": 1.7317, "step": 951 }, { "epoch": 0.07411804778356285, "grad_norm": 2.939499917301381, "learning_rate": 3.969985188738031e-05, "loss": 1.6835, "step": 952 }, { "epoch": 0.07419590287577246, "grad_norm": 4.139399924332297, "learning_rate": 3.969898970512667e-05, "loss": 1.8222, "step": 953 }, { "epoch": 0.0742737579679821, "grad_norm": 3.452464627871098, "learning_rate": 3.9698126295718565e-05, "loss": 1.8093, "step": 954 }, { "epoch": 0.07435161306019172, "grad_norm": 3.355114179439749, "learning_rate": 3.969726165920977e-05, "loss": 1.6463, "step": 955 }, { "epoch": 0.07442946815240134, "grad_norm": 3.601573523259798, "learning_rate": 3.969639579565417e-05, "loss": 1.7475, "step": 956 }, { "epoch": 0.07450732324461097, "grad_norm": 3.280638089830501, "learning_rate": 3.969552870510569e-05, "loss": 1.7421, "step": 957 }, { "epoch": 0.07458517833682059, "grad_norm": 3.43676435777304, "learning_rate": 3.9694660387618345e-05, "loss": 1.7974, "step": 958 }, { "epoch": 0.07466303342903022, "grad_norm": 3.389930678557845, "learning_rate": 3.969379084324624e-05, "loss": 1.7633, "step": 959 }, { "epoch": 0.07474088852123985, "grad_norm": 3.2990668810975676, "learning_rate": 3.969292007204353e-05, "loss": 1.7529, "step": 960 }, { "epoch": 0.07481874361344946, "grad_norm": 3.2711883635146064, "learning_rate": 3.969204807406446e-05, "loss": 1.8128, "step": 961 }, { "epoch": 0.0748965987056591, "grad_norm": 3.7775995839643066, "learning_rate": 3.969117484936336e-05, "loss": 1.8334, "step": 962 }, { "epoch": 0.07497445379786871, "grad_norm": 3.4204450459206033, "learning_rate": 3.969030039799462e-05, "loss": 1.7677, "step": 963 }, { "epoch": 0.07505230889007834, "grad_norm": 3.3904131827522543, "learning_rate": 3.9689424720012726e-05, "loss": 1.7567, "step": 964 }, { "epoch": 0.07513016398228797, "grad_norm": 3.749044027769672, "learning_rate": 3.9688547815472215e-05, "loss": 1.7412, "step": 965 }, { "epoch": 0.07520801907449759, "grad_norm": 3.136188246501114, "learning_rate": 3.968766968442772e-05, "loss": 1.7939, "step": 966 }, { "epoch": 0.07528587416670722, "grad_norm": 3.197643531327739, "learning_rate": 3.968679032693394e-05, "loss": 1.8636, "step": 967 }, { "epoch": 0.07536372925891684, "grad_norm": 3.378694301889872, "learning_rate": 3.9685909743045664e-05, "loss": 1.8227, "step": 968 }, { "epoch": 0.07544158435112647, "grad_norm": 3.4075850377701293, "learning_rate": 3.968502793281774e-05, "loss": 1.7854, "step": 969 }, { "epoch": 0.07551943944333608, "grad_norm": 3.314698362564512, "learning_rate": 3.968414489630511e-05, "loss": 1.6604, "step": 970 }, { "epoch": 0.07559729453554571, "grad_norm": 3.394070921050226, "learning_rate": 3.9683260633562766e-05, "loss": 1.7575, "step": 971 }, { "epoch": 0.07567514962775534, "grad_norm": 3.669463584806851, "learning_rate": 3.968237514464581e-05, "loss": 1.7436, "step": 972 }, { "epoch": 0.07575300471996496, "grad_norm": 3.2564946482405768, "learning_rate": 3.9681488429609395e-05, "loss": 1.7275, "step": 973 }, { "epoch": 0.07583085981217459, "grad_norm": 4.292843960488684, "learning_rate": 3.9680600488508764e-05, "loss": 1.678, "step": 974 }, { "epoch": 0.07590871490438421, "grad_norm": 3.773774136867133, "learning_rate": 3.967971132139923e-05, "loss": 1.7831, "step": 975 }, { "epoch": 0.07598656999659384, "grad_norm": 4.996015930100179, "learning_rate": 3.967882092833618e-05, "loss": 1.8756, "step": 976 }, { "epoch": 0.07606442508880347, "grad_norm": 3.472219082744472, "learning_rate": 3.967792930937509e-05, "loss": 1.8126, "step": 977 }, { "epoch": 0.07614228018101309, "grad_norm": 4.2920837431446754, "learning_rate": 3.9677036464571495e-05, "loss": 1.809, "step": 978 }, { "epoch": 0.07622013527322272, "grad_norm": 3.761428282573026, "learning_rate": 3.9676142393981024e-05, "loss": 1.7555, "step": 979 }, { "epoch": 0.07629799036543233, "grad_norm": 3.3214758692090034, "learning_rate": 3.967524709765936e-05, "loss": 1.7488, "step": 980 }, { "epoch": 0.07637584545764196, "grad_norm": 4.329369878159049, "learning_rate": 3.967435057566229e-05, "loss": 1.8407, "step": 981 }, { "epoch": 0.0764537005498516, "grad_norm": 3.178315828384281, "learning_rate": 3.9673452828045654e-05, "loss": 1.6955, "step": 982 }, { "epoch": 0.07653155564206121, "grad_norm": 3.256017024735255, "learning_rate": 3.967255385486538e-05, "loss": 1.8058, "step": 983 }, { "epoch": 0.07660941073427084, "grad_norm": 3.514016469673632, "learning_rate": 3.967165365617747e-05, "loss": 1.7377, "step": 984 }, { "epoch": 0.07668726582648046, "grad_norm": 3.2376363061728854, "learning_rate": 3.9670752232038e-05, "loss": 1.8005, "step": 985 }, { "epoch": 0.07676512091869009, "grad_norm": 3.257377191459909, "learning_rate": 3.966984958250313e-05, "loss": 1.7896, "step": 986 }, { "epoch": 0.07684297601089972, "grad_norm": 3.2031882518402, "learning_rate": 3.966894570762909e-05, "loss": 1.666, "step": 987 }, { "epoch": 0.07692083110310934, "grad_norm": 3.39855107574606, "learning_rate": 3.966804060747218e-05, "loss": 1.6834, "step": 988 }, { "epoch": 0.07699868619531897, "grad_norm": 3.530206215229486, "learning_rate": 3.966713428208879e-05, "loss": 1.7544, "step": 989 }, { "epoch": 0.07707654128752858, "grad_norm": 3.128556671805605, "learning_rate": 3.966622673153538e-05, "loss": 1.6732, "step": 990 }, { "epoch": 0.07715439637973821, "grad_norm": 3.104179011805843, "learning_rate": 3.966531795586848e-05, "loss": 1.6792, "step": 991 }, { "epoch": 0.07723225147194784, "grad_norm": 3.4146516059638463, "learning_rate": 3.966440795514471e-05, "loss": 1.7303, "step": 992 }, { "epoch": 0.07731010656415746, "grad_norm": 2.9650565323617952, "learning_rate": 3.966349672942075e-05, "loss": 1.6792, "step": 993 }, { "epoch": 0.07738796165636709, "grad_norm": 3.2189335427485912, "learning_rate": 3.966258427875337e-05, "loss": 1.7744, "step": 994 }, { "epoch": 0.07746581674857671, "grad_norm": 3.0618932243267905, "learning_rate": 3.966167060319942e-05, "loss": 1.7644, "step": 995 }, { "epoch": 0.07754367184078634, "grad_norm": 3.4514461667827683, "learning_rate": 3.9660755702815805e-05, "loss": 1.8141, "step": 996 }, { "epoch": 0.07762152693299595, "grad_norm": 3.4056214644398968, "learning_rate": 3.965983957765952e-05, "loss": 1.7019, "step": 997 }, { "epoch": 0.07769938202520559, "grad_norm": 3.6444413439777485, "learning_rate": 3.965892222778764e-05, "loss": 1.6733, "step": 998 }, { "epoch": 0.07777723711741522, "grad_norm": 3.3668049756000826, "learning_rate": 3.9658003653257315e-05, "loss": 1.7554, "step": 999 }, { "epoch": 0.07785509220962483, "grad_norm": 3.2887952978841675, "learning_rate": 3.965708385412576e-05, "loss": 1.7637, "step": 1000 }, { "epoch": 0.07785509220962483, "eval_loss": 0.21669529378414154, "eval_runtime": 162.5357, "eval_samples_per_second": 17.719, "eval_steps_per_second": 0.634, "step": 1000 }, { "epoch": 0.07793294730183446, "grad_norm": 3.7643484786389583, "learning_rate": 3.9656162830450276e-05, "loss": 1.7416, "step": 1001 }, { "epoch": 0.07801080239404408, "grad_norm": 3.605314290745246, "learning_rate": 3.965524058228824e-05, "loss": 1.7596, "step": 1002 }, { "epoch": 0.07808865748625371, "grad_norm": 3.506738868571613, "learning_rate": 3.9654317109697107e-05, "loss": 1.7903, "step": 1003 }, { "epoch": 0.07816651257846334, "grad_norm": 3.3997583262113054, "learning_rate": 3.96533924127344e-05, "loss": 1.7252, "step": 1004 }, { "epoch": 0.07824436767067296, "grad_norm": 3.185850203419185, "learning_rate": 3.9652466491457726e-05, "loss": 1.742, "step": 1005 }, { "epoch": 0.07832222276288259, "grad_norm": 3.851328142639493, "learning_rate": 3.9651539345924764e-05, "loss": 1.6998, "step": 1006 }, { "epoch": 0.0784000778550922, "grad_norm": 3.238587166483287, "learning_rate": 3.9650610976193264e-05, "loss": 1.6777, "step": 1007 }, { "epoch": 0.07847793294730183, "grad_norm": 3.500050942435741, "learning_rate": 3.9649681382321076e-05, "loss": 1.6454, "step": 1008 }, { "epoch": 0.07855578803951146, "grad_norm": 3.1076340249271914, "learning_rate": 3.9648750564366095e-05, "loss": 1.7571, "step": 1009 }, { "epoch": 0.07863364313172108, "grad_norm": 3.0836581020565292, "learning_rate": 3.964781852238631e-05, "loss": 1.6633, "step": 1010 }, { "epoch": 0.07871149822393071, "grad_norm": 3.3416254774245773, "learning_rate": 3.964688525643979e-05, "loss": 1.7001, "step": 1011 }, { "epoch": 0.07878935331614033, "grad_norm": 3.532866059911427, "learning_rate": 3.964595076658466e-05, "loss": 1.6885, "step": 1012 }, { "epoch": 0.07886720840834996, "grad_norm": 3.0314243553376907, "learning_rate": 3.964501505287914e-05, "loss": 1.5495, "step": 1013 }, { "epoch": 0.07894506350055959, "grad_norm": 4.373983668636802, "learning_rate": 3.964407811538153e-05, "loss": 1.7378, "step": 1014 }, { "epoch": 0.0790229185927692, "grad_norm": 3.3566371407603013, "learning_rate": 3.9643139954150184e-05, "loss": 1.7998, "step": 1015 }, { "epoch": 0.07910077368497884, "grad_norm": 4.031955162750704, "learning_rate": 3.964220056924355e-05, "loss": 1.7242, "step": 1016 }, { "epoch": 0.07917862877718845, "grad_norm": 3.5848908735061826, "learning_rate": 3.964125996072015e-05, "loss": 1.7056, "step": 1017 }, { "epoch": 0.07925648386939808, "grad_norm": 3.8271304765938017, "learning_rate": 3.964031812863857e-05, "loss": 1.8012, "step": 1018 }, { "epoch": 0.07933433896160771, "grad_norm": 3.534068003461563, "learning_rate": 3.9639375073057497e-05, "loss": 1.7441, "step": 1019 }, { "epoch": 0.07941219405381733, "grad_norm": 3.6589144400455376, "learning_rate": 3.9638430794035666e-05, "loss": 1.7713, "step": 1020 }, { "epoch": 0.07949004914602696, "grad_norm": 3.6254396107097993, "learning_rate": 3.96374852916319e-05, "loss": 1.7629, "step": 1021 }, { "epoch": 0.07956790423823658, "grad_norm": 2.9819776707725865, "learning_rate": 3.9636538565905114e-05, "loss": 1.696, "step": 1022 }, { "epoch": 0.07964575933044621, "grad_norm": 3.5077582103354947, "learning_rate": 3.9635590616914265e-05, "loss": 1.6669, "step": 1023 }, { "epoch": 0.07972361442265583, "grad_norm": 3.3347795834475558, "learning_rate": 3.963464144471843e-05, "loss": 1.7351, "step": 1024 }, { "epoch": 0.07980146951486546, "grad_norm": 3.3093368103960072, "learning_rate": 3.963369104937671e-05, "loss": 1.645, "step": 1025 }, { "epoch": 0.07987932460707509, "grad_norm": 3.2631347328841778, "learning_rate": 3.963273943094833e-05, "loss": 1.7594, "step": 1026 }, { "epoch": 0.0799571796992847, "grad_norm": 2.9244192850905093, "learning_rate": 3.963178658949256e-05, "loss": 1.6453, "step": 1027 }, { "epoch": 0.08003503479149433, "grad_norm": 3.1808415257323364, "learning_rate": 3.963083252506877e-05, "loss": 1.6656, "step": 1028 }, { "epoch": 0.08011288988370395, "grad_norm": 3.022243773560085, "learning_rate": 3.962987723773639e-05, "loss": 1.7051, "step": 1029 }, { "epoch": 0.08019074497591358, "grad_norm": 3.155070884098229, "learning_rate": 3.9628920727554915e-05, "loss": 1.7185, "step": 1030 }, { "epoch": 0.08026860006812321, "grad_norm": 3.480429304934681, "learning_rate": 3.9627962994583956e-05, "loss": 1.6802, "step": 1031 }, { "epoch": 0.08034645516033283, "grad_norm": 2.902045134366932, "learning_rate": 3.962700403888315e-05, "loss": 1.6966, "step": 1032 }, { "epoch": 0.08042431025254246, "grad_norm": 3.312796793333853, "learning_rate": 3.962604386051226e-05, "loss": 1.7022, "step": 1033 }, { "epoch": 0.08050216534475207, "grad_norm": 3.4506795523328577, "learning_rate": 3.962508245953108e-05, "loss": 1.7356, "step": 1034 }, { "epoch": 0.0805800204369617, "grad_norm": 3.3673934035444195, "learning_rate": 3.962411983599951e-05, "loss": 1.7675, "step": 1035 }, { "epoch": 0.08065787552917134, "grad_norm": 3.423688089109094, "learning_rate": 3.962315598997751e-05, "loss": 1.732, "step": 1036 }, { "epoch": 0.08073573062138095, "grad_norm": 3.130008197356371, "learning_rate": 3.962219092152514e-05, "loss": 1.6509, "step": 1037 }, { "epoch": 0.08081358571359058, "grad_norm": 3.1106513202327335, "learning_rate": 3.96212246307025e-05, "loss": 1.6659, "step": 1038 }, { "epoch": 0.0808914408058002, "grad_norm": 3.281108689560882, "learning_rate": 3.96202571175698e-05, "loss": 1.6873, "step": 1039 }, { "epoch": 0.08096929589800983, "grad_norm": 3.6277481247717884, "learning_rate": 3.961928838218729e-05, "loss": 1.6433, "step": 1040 }, { "epoch": 0.08104715099021946, "grad_norm": 3.4875553038593945, "learning_rate": 3.961831842461535e-05, "loss": 1.6453, "step": 1041 }, { "epoch": 0.08112500608242908, "grad_norm": 3.123202219100174, "learning_rate": 3.961734724491438e-05, "loss": 1.6273, "step": 1042 }, { "epoch": 0.08120286117463871, "grad_norm": 3.426719836584432, "learning_rate": 3.961637484314488e-05, "loss": 1.7235, "step": 1043 }, { "epoch": 0.08128071626684832, "grad_norm": 3.6050643897769947, "learning_rate": 3.961540121936744e-05, "loss": 1.6527, "step": 1044 }, { "epoch": 0.08135857135905795, "grad_norm": 3.186775378741877, "learning_rate": 3.96144263736427e-05, "loss": 1.639, "step": 1045 }, { "epoch": 0.08143642645126759, "grad_norm": 3.9627095330534097, "learning_rate": 3.96134503060314e-05, "loss": 1.6414, "step": 1046 }, { "epoch": 0.0815142815434772, "grad_norm": 3.2918143559230724, "learning_rate": 3.961247301659433e-05, "loss": 1.5875, "step": 1047 }, { "epoch": 0.08159213663568683, "grad_norm": 3.2101209872383603, "learning_rate": 3.961149450539238e-05, "loss": 1.7029, "step": 1048 }, { "epoch": 0.08166999172789645, "grad_norm": 4.17333093126589, "learning_rate": 3.961051477248651e-05, "loss": 1.6768, "step": 1049 }, { "epoch": 0.08174784682010608, "grad_norm": 3.067265038537666, "learning_rate": 3.9609533817937736e-05, "loss": 1.6504, "step": 1050 }, { "epoch": 0.08174784682010608, "eval_loss": 0.2078818529844284, "eval_runtime": 162.1671, "eval_samples_per_second": 17.759, "eval_steps_per_second": 0.635, "step": 1050 }, { "epoch": 0.0818257019123157, "grad_norm": 2.922183833730028, "learning_rate": 3.960855164180719e-05, "loss": 1.7309, "step": 1051 }, { "epoch": 0.08190355700452533, "grad_norm": 3.334340491472239, "learning_rate": 3.960756824415604e-05, "loss": 1.6105, "step": 1052 }, { "epoch": 0.08198141209673496, "grad_norm": 2.8456578244119988, "learning_rate": 3.960658362504555e-05, "loss": 1.6522, "step": 1053 }, { "epoch": 0.08205926718894457, "grad_norm": 3.0741751358689684, "learning_rate": 3.9605597784537065e-05, "loss": 1.6295, "step": 1054 }, { "epoch": 0.0821371222811542, "grad_norm": 3.393446358627214, "learning_rate": 3.9604610722691984e-05, "loss": 1.697, "step": 1055 }, { "epoch": 0.08221497737336382, "grad_norm": 3.357427560663552, "learning_rate": 3.960362243957181e-05, "loss": 1.6603, "step": 1056 }, { "epoch": 0.08229283246557345, "grad_norm": 3.137000437483111, "learning_rate": 3.960263293523811e-05, "loss": 1.6662, "step": 1057 }, { "epoch": 0.08237068755778308, "grad_norm": 3.0089241460641034, "learning_rate": 3.96016422097525e-05, "loss": 1.626, "step": 1058 }, { "epoch": 0.0824485426499927, "grad_norm": 3.1463658831606534, "learning_rate": 3.9600650263176726e-05, "loss": 1.5795, "step": 1059 }, { "epoch": 0.08252639774220233, "grad_norm": 2.863982063408455, "learning_rate": 3.959965709557258e-05, "loss": 1.6214, "step": 1060 }, { "epoch": 0.08260425283441195, "grad_norm": 3.1812279847661746, "learning_rate": 3.9598662707001915e-05, "loss": 1.6611, "step": 1061 }, { "epoch": 0.08268210792662158, "grad_norm": 3.073675448315599, "learning_rate": 3.959766709752669e-05, "loss": 1.6496, "step": 1062 }, { "epoch": 0.0827599630188312, "grad_norm": 3.319699907640211, "learning_rate": 3.9596670267208914e-05, "loss": 1.625, "step": 1063 }, { "epoch": 0.08283781811104082, "grad_norm": 3.067191061888492, "learning_rate": 3.95956722161107e-05, "loss": 1.6782, "step": 1064 }, { "epoch": 0.08291567320325045, "grad_norm": 3.143485517305322, "learning_rate": 3.959467294429421e-05, "loss": 1.637, "step": 1065 }, { "epoch": 0.08299352829546007, "grad_norm": 3.1759734642004602, "learning_rate": 3.95936724518217e-05, "loss": 1.6919, "step": 1066 }, { "epoch": 0.0830713833876697, "grad_norm": 2.980594355561054, "learning_rate": 3.9592670738755495e-05, "loss": 1.5951, "step": 1067 }, { "epoch": 0.08314923847987933, "grad_norm": 2.649570130848916, "learning_rate": 3.9591667805157995e-05, "loss": 1.6327, "step": 1068 }, { "epoch": 0.08322709357208895, "grad_norm": 3.942105450539992, "learning_rate": 3.959066365109169e-05, "loss": 1.6641, "step": 1069 }, { "epoch": 0.08330494866429858, "grad_norm": 3.222143708865712, "learning_rate": 3.958965827661911e-05, "loss": 1.7263, "step": 1070 }, { "epoch": 0.0833828037565082, "grad_norm": 3.175088594283315, "learning_rate": 3.958865168180291e-05, "loss": 1.6111, "step": 1071 }, { "epoch": 0.08346065884871783, "grad_norm": 3.3366761020203177, "learning_rate": 3.958764386670577e-05, "loss": 1.5455, "step": 1072 }, { "epoch": 0.08353851394092744, "grad_norm": 3.1584122435775126, "learning_rate": 3.95866348313905e-05, "loss": 1.7077, "step": 1073 }, { "epoch": 0.08361636903313707, "grad_norm": 3.2117669651895584, "learning_rate": 3.958562457591994e-05, "loss": 1.6989, "step": 1074 }, { "epoch": 0.0836942241253467, "grad_norm": 3.481685465923124, "learning_rate": 3.9584613100357036e-05, "loss": 1.6069, "step": 1075 }, { "epoch": 0.08377207921755632, "grad_norm": 3.201794113355911, "learning_rate": 3.958360040476478e-05, "loss": 1.604, "step": 1076 }, { "epoch": 0.08384993430976595, "grad_norm": 3.3334484825729693, "learning_rate": 3.958258648920628e-05, "loss": 1.62, "step": 1077 }, { "epoch": 0.08392778940197557, "grad_norm": 3.451083886944727, "learning_rate": 3.958157135374469e-05, "loss": 1.5571, "step": 1078 }, { "epoch": 0.0840056444941852, "grad_norm": 3.6178842347165356, "learning_rate": 3.9580554998443244e-05, "loss": 1.6756, "step": 1079 }, { "epoch": 0.08408349958639483, "grad_norm": 3.7862671781309363, "learning_rate": 3.957953742336525e-05, "loss": 1.7722, "step": 1080 }, { "epoch": 0.08416135467860444, "grad_norm": 3.0543940967785317, "learning_rate": 3.9578518628574114e-05, "loss": 1.6149, "step": 1081 }, { "epoch": 0.08423920977081407, "grad_norm": 3.231156367738774, "learning_rate": 3.957749861413329e-05, "loss": 1.6335, "step": 1082 }, { "epoch": 0.08431706486302369, "grad_norm": 3.511751623666774, "learning_rate": 3.957647738010633e-05, "loss": 1.6512, "step": 1083 }, { "epoch": 0.08439491995523332, "grad_norm": 3.2973256254289165, "learning_rate": 3.957545492655684e-05, "loss": 1.6762, "step": 1084 }, { "epoch": 0.08447277504744295, "grad_norm": 3.31112841443361, "learning_rate": 3.957443125354853e-05, "loss": 1.6097, "step": 1085 }, { "epoch": 0.08455063013965257, "grad_norm": 2.990221007502546, "learning_rate": 3.9573406361145154e-05, "loss": 1.6509, "step": 1086 }, { "epoch": 0.0846284852318622, "grad_norm": 3.2320313966927996, "learning_rate": 3.957238024941057e-05, "loss": 1.591, "step": 1087 }, { "epoch": 0.08470634032407182, "grad_norm": 3.0891909130307202, "learning_rate": 3.9571352918408694e-05, "loss": 1.6413, "step": 1088 }, { "epoch": 0.08478419541628145, "grad_norm": 4.374532934175237, "learning_rate": 3.9570324368203516e-05, "loss": 1.6973, "step": 1089 }, { "epoch": 0.08486205050849108, "grad_norm": 3.519876409099006, "learning_rate": 3.956929459885912e-05, "loss": 1.5408, "step": 1090 }, { "epoch": 0.0849399056007007, "grad_norm": 3.805755924639585, "learning_rate": 3.956826361043967e-05, "loss": 1.6353, "step": 1091 }, { "epoch": 0.08501776069291032, "grad_norm": 4.058037341070297, "learning_rate": 3.956723140300936e-05, "loss": 1.6785, "step": 1092 }, { "epoch": 0.08509561578511994, "grad_norm": 3.9908829232289267, "learning_rate": 3.956619797663251e-05, "loss": 1.6998, "step": 1093 }, { "epoch": 0.08517347087732957, "grad_norm": 3.861808451195817, "learning_rate": 3.95651633313735e-05, "loss": 1.6663, "step": 1094 }, { "epoch": 0.0852513259695392, "grad_norm": 3.2913469549034975, "learning_rate": 3.956412746729678e-05, "loss": 1.5891, "step": 1095 }, { "epoch": 0.08532918106174882, "grad_norm": 3.551306986294235, "learning_rate": 3.956309038446687e-05, "loss": 1.5671, "step": 1096 }, { "epoch": 0.08540703615395845, "grad_norm": 3.1170099876651114, "learning_rate": 3.956205208294839e-05, "loss": 1.5285, "step": 1097 }, { "epoch": 0.08548489124616807, "grad_norm": 3.5022146349851577, "learning_rate": 3.9561012562806016e-05, "loss": 1.6908, "step": 1098 }, { "epoch": 0.0855627463383777, "grad_norm": 3.442959731348132, "learning_rate": 3.95599718241045e-05, "loss": 1.6071, "step": 1099 }, { "epoch": 0.08564060143058731, "grad_norm": 2.9845517161868163, "learning_rate": 3.955892986690868e-05, "loss": 1.6405, "step": 1100 }, { "epoch": 0.08564060143058731, "eval_loss": 0.2024768888950348, "eval_runtime": 161.9548, "eval_samples_per_second": 17.783, "eval_steps_per_second": 0.636, "step": 1100 }, { "epoch": 0.08571845652279694, "grad_norm": 4.227765806256807, "learning_rate": 3.955788669128347e-05, "loss": 1.6716, "step": 1101 }, { "epoch": 0.08579631161500657, "grad_norm": 3.1783897361009448, "learning_rate": 3.955684229729384e-05, "loss": 1.6047, "step": 1102 }, { "epoch": 0.08587416670721619, "grad_norm": 4.1410330522731185, "learning_rate": 3.955579668500487e-05, "loss": 1.5989, "step": 1103 }, { "epoch": 0.08595202179942582, "grad_norm": 3.414820898926263, "learning_rate": 3.9554749854481684e-05, "loss": 1.5626, "step": 1104 }, { "epoch": 0.08602987689163544, "grad_norm": 3.5661807015388463, "learning_rate": 3.955370180578949e-05, "loss": 1.5037, "step": 1105 }, { "epoch": 0.08610773198384507, "grad_norm": 3.3356355577929433, "learning_rate": 3.9552652538993594e-05, "loss": 1.5379, "step": 1106 }, { "epoch": 0.0861855870760547, "grad_norm": 3.1164514763849325, "learning_rate": 3.955160205415935e-05, "loss": 1.5046, "step": 1107 }, { "epoch": 0.08626344216826431, "grad_norm": 3.3522343486482686, "learning_rate": 3.955055035135219e-05, "loss": 1.5465, "step": 1108 }, { "epoch": 0.08634129726047395, "grad_norm": 3.0202279625308885, "learning_rate": 3.9549497430637645e-05, "loss": 1.4922, "step": 1109 }, { "epoch": 0.08641915235268356, "grad_norm": 3.1995919384806903, "learning_rate": 3.95484432920813e-05, "loss": 1.5764, "step": 1110 }, { "epoch": 0.08649700744489319, "grad_norm": 3.421789178505091, "learning_rate": 3.9547387935748825e-05, "loss": 1.5648, "step": 1111 }, { "epoch": 0.08657486253710282, "grad_norm": 3.203969538915921, "learning_rate": 3.9546331361705955e-05, "loss": 1.655, "step": 1112 }, { "epoch": 0.08665271762931244, "grad_norm": 4.152267887813613, "learning_rate": 3.954527357001853e-05, "loss": 1.7236, "step": 1113 }, { "epoch": 0.08673057272152207, "grad_norm": 2.996441882219047, "learning_rate": 3.954421456075242e-05, "loss": 1.5556, "step": 1114 }, { "epoch": 0.08680842781373169, "grad_norm": 3.7458492449587504, "learning_rate": 3.954315433397361e-05, "loss": 1.5829, "step": 1115 }, { "epoch": 0.08688628290594132, "grad_norm": 3.077911606583266, "learning_rate": 3.9542092889748155e-05, "loss": 1.6486, "step": 1116 }, { "epoch": 0.08696413799815095, "grad_norm": 3.342400088342974, "learning_rate": 3.9541030228142167e-05, "loss": 1.5628, "step": 1117 }, { "epoch": 0.08704199309036056, "grad_norm": 3.20909556182645, "learning_rate": 3.9539966349221836e-05, "loss": 1.5874, "step": 1118 }, { "epoch": 0.0871198481825702, "grad_norm": 2.952223518058784, "learning_rate": 3.9538901253053456e-05, "loss": 1.5317, "step": 1119 }, { "epoch": 0.08719770327477981, "grad_norm": 3.41006133015798, "learning_rate": 3.953783493970337e-05, "loss": 1.4436, "step": 1120 }, { "epoch": 0.08727555836698944, "grad_norm": 3.205399316018049, "learning_rate": 3.9536767409238e-05, "loss": 1.5511, "step": 1121 }, { "epoch": 0.08735341345919907, "grad_norm": 3.2873000869364923, "learning_rate": 3.953569866172386e-05, "loss": 1.5185, "step": 1122 }, { "epoch": 0.08743126855140869, "grad_norm": 3.2321251277184277, "learning_rate": 3.95346286972275e-05, "loss": 1.5295, "step": 1123 }, { "epoch": 0.08750912364361832, "grad_norm": 3.447295200311108, "learning_rate": 3.9533557515815614e-05, "loss": 1.5626, "step": 1124 }, { "epoch": 0.08758697873582794, "grad_norm": 2.851335893616625, "learning_rate": 3.953248511755489e-05, "loss": 1.5269, "step": 1125 }, { "epoch": 0.08766483382803757, "grad_norm": 3.104052165956419, "learning_rate": 3.953141150251217e-05, "loss": 1.6446, "step": 1126 }, { "epoch": 0.08774268892024718, "grad_norm": 3.140856784396237, "learning_rate": 3.9530336670754314e-05, "loss": 1.5385, "step": 1127 }, { "epoch": 0.08782054401245681, "grad_norm": 2.8389007440209992, "learning_rate": 3.952926062234829e-05, "loss": 1.543, "step": 1128 }, { "epoch": 0.08789839910466644, "grad_norm": 3.208214782341899, "learning_rate": 3.952818335736111e-05, "loss": 1.5246, "step": 1129 }, { "epoch": 0.08797625419687606, "grad_norm": 2.8837274203101924, "learning_rate": 3.952710487585991e-05, "loss": 1.5468, "step": 1130 }, { "epoch": 0.08805410928908569, "grad_norm": 3.1529517801056746, "learning_rate": 3.952602517791186e-05, "loss": 1.6157, "step": 1131 }, { "epoch": 0.08813196438129531, "grad_norm": 3.1378447002823173, "learning_rate": 3.952494426358422e-05, "loss": 1.5103, "step": 1132 }, { "epoch": 0.08820981947350494, "grad_norm": 2.871432238338474, "learning_rate": 3.952386213294433e-05, "loss": 1.4982, "step": 1133 }, { "epoch": 0.08828767456571457, "grad_norm": 3.047919280954517, "learning_rate": 3.952277878605959e-05, "loss": 1.5725, "step": 1134 }, { "epoch": 0.08836552965792419, "grad_norm": 3.0820702308164836, "learning_rate": 3.9521694222997505e-05, "loss": 1.5224, "step": 1135 }, { "epoch": 0.08844338475013382, "grad_norm": 2.982254496079295, "learning_rate": 3.952060844382564e-05, "loss": 1.5555, "step": 1136 }, { "epoch": 0.08852123984234343, "grad_norm": 3.1491898759090353, "learning_rate": 3.9519521448611604e-05, "loss": 1.5601, "step": 1137 }, { "epoch": 0.08859909493455306, "grad_norm": 3.1028235195098857, "learning_rate": 3.951843323742315e-05, "loss": 1.5668, "step": 1138 }, { "epoch": 0.0886769500267627, "grad_norm": 2.9319274253277783, "learning_rate": 3.951734381032803e-05, "loss": 1.6285, "step": 1139 }, { "epoch": 0.08875480511897231, "grad_norm": 3.4176104743281353, "learning_rate": 3.9516253167394145e-05, "loss": 1.6045, "step": 1140 }, { "epoch": 0.08883266021118194, "grad_norm": 3.0805805376399986, "learning_rate": 3.951516130868942e-05, "loss": 1.5459, "step": 1141 }, { "epoch": 0.08891051530339156, "grad_norm": 3.0742525917577197, "learning_rate": 3.951406823428187e-05, "loss": 1.5809, "step": 1142 }, { "epoch": 0.08898837039560119, "grad_norm": 2.995130157015672, "learning_rate": 3.95129739442396e-05, "loss": 1.615, "step": 1143 }, { "epoch": 0.08906622548781082, "grad_norm": 3.4195717436798465, "learning_rate": 3.9511878438630764e-05, "loss": 1.6002, "step": 1144 }, { "epoch": 0.08914408058002044, "grad_norm": 3.139735414611809, "learning_rate": 3.951078171752362e-05, "loss": 1.5173, "step": 1145 }, { "epoch": 0.08922193567223007, "grad_norm": 3.848523806146937, "learning_rate": 3.950968378098648e-05, "loss": 1.5329, "step": 1146 }, { "epoch": 0.08929979076443968, "grad_norm": 3.4196368800958594, "learning_rate": 3.950858462908775e-05, "loss": 1.5502, "step": 1147 }, { "epoch": 0.08937764585664931, "grad_norm": 3.82011307795574, "learning_rate": 3.9507484261895896e-05, "loss": 1.6601, "step": 1148 }, { "epoch": 0.08945550094885894, "grad_norm": 3.3168007767650796, "learning_rate": 3.9506382679479465e-05, "loss": 1.5743, "step": 1149 }, { "epoch": 0.08953335604106856, "grad_norm": 3.3428847134262565, "learning_rate": 3.9505279881907074e-05, "loss": 1.5547, "step": 1150 }, { "epoch": 0.08953335604106856, "eval_loss": 0.1989397257566452, "eval_runtime": 161.9553, "eval_samples_per_second": 17.783, "eval_steps_per_second": 0.636, "step": 1150 }, { "epoch": 0.08961121113327819, "grad_norm": 3.2399895306980624, "learning_rate": 3.9504175869247435e-05, "loss": 1.5049, "step": 1151 }, { "epoch": 0.0896890662254878, "grad_norm": 3.1078894984789107, "learning_rate": 3.950307064156932e-05, "loss": 1.53, "step": 1152 }, { "epoch": 0.08976692131769744, "grad_norm": 3.483762076052625, "learning_rate": 3.950196419894157e-05, "loss": 1.4556, "step": 1153 }, { "epoch": 0.08984477640990705, "grad_norm": 2.9015098185141586, "learning_rate": 3.950085654143312e-05, "loss": 1.5613, "step": 1154 }, { "epoch": 0.08992263150211668, "grad_norm": 3.267495750545399, "learning_rate": 3.949974766911298e-05, "loss": 1.5104, "step": 1155 }, { "epoch": 0.09000048659432631, "grad_norm": 3.1198661059996553, "learning_rate": 3.9498637582050205e-05, "loss": 1.5935, "step": 1156 }, { "epoch": 0.09007834168653593, "grad_norm": 3.10070081444124, "learning_rate": 3.9497526280313963e-05, "loss": 1.5805, "step": 1157 }, { "epoch": 0.09015619677874556, "grad_norm": 3.4146145815360067, "learning_rate": 3.949641376397348e-05, "loss": 1.6033, "step": 1158 }, { "epoch": 0.09023405187095518, "grad_norm": 3.1826480665358905, "learning_rate": 3.949530003309806e-05, "loss": 1.5536, "step": 1159 }, { "epoch": 0.09031190696316481, "grad_norm": 3.190462108393409, "learning_rate": 3.949418508775708e-05, "loss": 1.5993, "step": 1160 }, { "epoch": 0.09038976205537444, "grad_norm": 3.0983055714492367, "learning_rate": 3.9493068928020006e-05, "loss": 1.5102, "step": 1161 }, { "epoch": 0.09046761714758406, "grad_norm": 3.6836920667317705, "learning_rate": 3.949195155395637e-05, "loss": 1.6833, "step": 1162 }, { "epoch": 0.09054547223979369, "grad_norm": 2.952151822159848, "learning_rate": 3.949083296563576e-05, "loss": 1.4897, "step": 1163 }, { "epoch": 0.0906233273320033, "grad_norm": 3.3815318125659655, "learning_rate": 3.948971316312788e-05, "loss": 1.568, "step": 1164 }, { "epoch": 0.09070118242421293, "grad_norm": 3.1050426078270226, "learning_rate": 3.948859214650247e-05, "loss": 1.5017, "step": 1165 }, { "epoch": 0.09077903751642256, "grad_norm": 3.35156920722331, "learning_rate": 3.9487469915829376e-05, "loss": 1.5369, "step": 1166 }, { "epoch": 0.09085689260863218, "grad_norm": 3.8537363307816777, "learning_rate": 3.948634647117852e-05, "loss": 1.4854, "step": 1167 }, { "epoch": 0.09093474770084181, "grad_norm": 3.2057431444135744, "learning_rate": 3.948522181261985e-05, "loss": 1.5368, "step": 1168 }, { "epoch": 0.09101260279305143, "grad_norm": 3.6393457711444896, "learning_rate": 3.948409594022347e-05, "loss": 1.5847, "step": 1169 }, { "epoch": 0.09109045788526106, "grad_norm": 3.1696693891034253, "learning_rate": 3.948296885405948e-05, "loss": 1.469, "step": 1170 }, { "epoch": 0.09116831297747069, "grad_norm": 3.6194200507568137, "learning_rate": 3.9481840554198114e-05, "loss": 1.5261, "step": 1171 }, { "epoch": 0.0912461680696803, "grad_norm": 3.3955385958900446, "learning_rate": 3.9480711040709664e-05, "loss": 1.5394, "step": 1172 }, { "epoch": 0.09132402316188994, "grad_norm": 3.5277070033647164, "learning_rate": 3.947958031366447e-05, "loss": 1.5942, "step": 1173 }, { "epoch": 0.09140187825409955, "grad_norm": 3.091117776597006, "learning_rate": 3.947844837313299e-05, "loss": 1.579, "step": 1174 }, { "epoch": 0.09147973334630918, "grad_norm": 3.062541502858858, "learning_rate": 3.9477315219185737e-05, "loss": 1.579, "step": 1175 }, { "epoch": 0.09155758843851881, "grad_norm": 2.8767040453318846, "learning_rate": 3.947618085189329e-05, "loss": 1.5236, "step": 1176 }, { "epoch": 0.09163544353072843, "grad_norm": 3.100336380554377, "learning_rate": 3.947504527132633e-05, "loss": 1.5379, "step": 1177 }, { "epoch": 0.09171329862293806, "grad_norm": 2.9934645194887883, "learning_rate": 3.947390847755559e-05, "loss": 1.5052, "step": 1178 }, { "epoch": 0.09179115371514768, "grad_norm": 3.0635626613143936, "learning_rate": 3.9472770470651876e-05, "loss": 1.4937, "step": 1179 }, { "epoch": 0.09186900880735731, "grad_norm": 2.9620763186572265, "learning_rate": 3.94716312506861e-05, "loss": 1.5058, "step": 1180 }, { "epoch": 0.09194686389956692, "grad_norm": 2.8347718740082395, "learning_rate": 3.947049081772922e-05, "loss": 1.489, "step": 1181 }, { "epoch": 0.09202471899177656, "grad_norm": 2.8257807540568334, "learning_rate": 3.9469349171852284e-05, "loss": 1.5349, "step": 1182 }, { "epoch": 0.09210257408398619, "grad_norm": 3.060587239586157, "learning_rate": 3.94682063131264e-05, "loss": 1.4558, "step": 1183 }, { "epoch": 0.0921804291761958, "grad_norm": 2.8742292106489984, "learning_rate": 3.9467062241622785e-05, "loss": 1.5213, "step": 1184 }, { "epoch": 0.09225828426840543, "grad_norm": 2.7528450271348897, "learning_rate": 3.946591695741269e-05, "loss": 1.5343, "step": 1185 }, { "epoch": 0.09233613936061505, "grad_norm": 3.07116908454457, "learning_rate": 3.946477046056746e-05, "loss": 1.4874, "step": 1186 }, { "epoch": 0.09241399445282468, "grad_norm": 3.018694661993361, "learning_rate": 3.9463622751158534e-05, "loss": 1.4874, "step": 1187 }, { "epoch": 0.09249184954503431, "grad_norm": 2.677692387096039, "learning_rate": 3.946247382925739e-05, "loss": 1.4264, "step": 1188 }, { "epoch": 0.09256970463724393, "grad_norm": 3.0774964240896105, "learning_rate": 3.946132369493561e-05, "loss": 1.511, "step": 1189 }, { "epoch": 0.09264755972945356, "grad_norm": 2.909096038553044, "learning_rate": 3.946017234826484e-05, "loss": 1.4724, "step": 1190 }, { "epoch": 0.09272541482166317, "grad_norm": 3.236071534638894, "learning_rate": 3.945901978931681e-05, "loss": 1.5033, "step": 1191 }, { "epoch": 0.0928032699138728, "grad_norm": 3.052358895899234, "learning_rate": 3.945786601816331e-05, "loss": 1.5694, "step": 1192 }, { "epoch": 0.09288112500608244, "grad_norm": 2.9351133129751847, "learning_rate": 3.9456711034876203e-05, "loss": 1.4794, "step": 1193 }, { "epoch": 0.09295898009829205, "grad_norm": 2.804560215132515, "learning_rate": 3.9455554839527466e-05, "loss": 1.4812, "step": 1194 }, { "epoch": 0.09303683519050168, "grad_norm": 2.952341286421733, "learning_rate": 3.945439743218911e-05, "loss": 1.4962, "step": 1195 }, { "epoch": 0.0931146902827113, "grad_norm": 3.058539130439525, "learning_rate": 3.945323881293324e-05, "loss": 1.5172, "step": 1196 }, { "epoch": 0.09319254537492093, "grad_norm": 2.7153212816007266, "learning_rate": 3.945207898183203e-05, "loss": 1.4698, "step": 1197 }, { "epoch": 0.09327040046713056, "grad_norm": 2.9986019409395253, "learning_rate": 3.9450917938957725e-05, "loss": 1.5195, "step": 1198 }, { "epoch": 0.09334825555934018, "grad_norm": 2.912262631922786, "learning_rate": 3.9449755684382665e-05, "loss": 1.4937, "step": 1199 }, { "epoch": 0.0934261106515498, "grad_norm": 2.7048633926132704, "learning_rate": 3.9448592218179246e-05, "loss": 1.5366, "step": 1200 }, { "epoch": 0.0934261106515498, "eval_loss": 0.18920864164829254, "eval_runtime": 161.9409, "eval_samples_per_second": 17.784, "eval_steps_per_second": 0.636, "step": 1200 }, { "epoch": 0.09350396574375942, "grad_norm": 2.8284923700600686, "learning_rate": 3.9447427540419955e-05, "loss": 1.4439, "step": 1201 }, { "epoch": 0.09358182083596905, "grad_norm": 2.618413581130501, "learning_rate": 3.944626165117733e-05, "loss": 1.3816, "step": 1202 }, { "epoch": 0.09365967592817867, "grad_norm": 3.376988843676237, "learning_rate": 3.944509455052401e-05, "loss": 1.5904, "step": 1203 }, { "epoch": 0.0937375310203883, "grad_norm": 3.0302269998814015, "learning_rate": 3.9443926238532706e-05, "loss": 1.5082, "step": 1204 }, { "epoch": 0.09381538611259793, "grad_norm": 3.240220499422712, "learning_rate": 3.944275671527619e-05, "loss": 1.5764, "step": 1205 }, { "epoch": 0.09389324120480755, "grad_norm": 3.042074996371278, "learning_rate": 3.944158598082732e-05, "loss": 1.5483, "step": 1206 }, { "epoch": 0.09397109629701718, "grad_norm": 2.993801602890413, "learning_rate": 3.944041403525902e-05, "loss": 1.5589, "step": 1207 }, { "epoch": 0.0940489513892268, "grad_norm": 2.9203790479482516, "learning_rate": 3.9439240878644315e-05, "loss": 1.4524, "step": 1208 }, { "epoch": 0.09412680648143643, "grad_norm": 3.0938952868737264, "learning_rate": 3.943806651105626e-05, "loss": 1.5897, "step": 1209 }, { "epoch": 0.09420466157364606, "grad_norm": 2.8702493985272204, "learning_rate": 3.943689093256804e-05, "loss": 1.4528, "step": 1210 }, { "epoch": 0.09428251666585567, "grad_norm": 2.904116804944989, "learning_rate": 3.943571414325288e-05, "loss": 1.494, "step": 1211 }, { "epoch": 0.0943603717580653, "grad_norm": 2.993894334410336, "learning_rate": 3.943453614318408e-05, "loss": 1.4256, "step": 1212 }, { "epoch": 0.09443822685027492, "grad_norm": 3.0680640246602766, "learning_rate": 3.9433356932435024e-05, "loss": 1.5835, "step": 1213 }, { "epoch": 0.09451608194248455, "grad_norm": 2.8155112427954014, "learning_rate": 3.943217651107918e-05, "loss": 1.4722, "step": 1214 }, { "epoch": 0.09459393703469418, "grad_norm": 2.8145530948683004, "learning_rate": 3.943099487919008e-05, "loss": 1.4858, "step": 1215 }, { "epoch": 0.0946717921269038, "grad_norm": 2.7999683404970854, "learning_rate": 3.942981203684133e-05, "loss": 1.5529, "step": 1216 }, { "epoch": 0.09474964721911343, "grad_norm": 2.7580081938317464, "learning_rate": 3.942862798410662e-05, "loss": 1.4968, "step": 1217 }, { "epoch": 0.09482750231132304, "grad_norm": 3.0247399602998413, "learning_rate": 3.942744272105971e-05, "loss": 1.4524, "step": 1218 }, { "epoch": 0.09490535740353268, "grad_norm": 3.0629176076968565, "learning_rate": 3.942625624777443e-05, "loss": 1.5248, "step": 1219 }, { "epoch": 0.0949832124957423, "grad_norm": 3.1102049604650306, "learning_rate": 3.9425068564324705e-05, "loss": 1.4676, "step": 1220 }, { "epoch": 0.09506106758795192, "grad_norm": 3.124942565172596, "learning_rate": 3.942387967078451e-05, "loss": 1.3767, "step": 1221 }, { "epoch": 0.09513892268016155, "grad_norm": 2.8595783130512022, "learning_rate": 3.942268956722791e-05, "loss": 1.5087, "step": 1222 }, { "epoch": 0.09521677777237117, "grad_norm": 3.043752344028645, "learning_rate": 3.9421498253729055e-05, "loss": 1.4956, "step": 1223 }, { "epoch": 0.0952946328645808, "grad_norm": 2.975724718116325, "learning_rate": 3.9420305730362134e-05, "loss": 1.4615, "step": 1224 }, { "epoch": 0.09537248795679043, "grad_norm": 2.796625344644714, "learning_rate": 3.941911199720146e-05, "loss": 1.4465, "step": 1225 }, { "epoch": 0.09545034304900005, "grad_norm": 2.805149299093499, "learning_rate": 3.941791705432138e-05, "loss": 1.5494, "step": 1226 }, { "epoch": 0.09552819814120968, "grad_norm": 3.1716189137891235, "learning_rate": 3.9416720901796344e-05, "loss": 1.4931, "step": 1227 }, { "epoch": 0.0956060532334193, "grad_norm": 2.8161138546581284, "learning_rate": 3.941552353970086e-05, "loss": 1.5042, "step": 1228 }, { "epoch": 0.09568390832562892, "grad_norm": 2.818442934054134, "learning_rate": 3.9414324968109516e-05, "loss": 1.5409, "step": 1229 }, { "epoch": 0.09576176341783854, "grad_norm": 3.5367740604360023, "learning_rate": 3.941312518709699e-05, "loss": 1.5562, "step": 1230 }, { "epoch": 0.09583961851004817, "grad_norm": 2.975808825671094, "learning_rate": 3.941192419673801e-05, "loss": 1.459, "step": 1231 }, { "epoch": 0.0959174736022578, "grad_norm": 2.899057951321596, "learning_rate": 3.94107219971074e-05, "loss": 1.5093, "step": 1232 }, { "epoch": 0.09599532869446742, "grad_norm": 3.105824678150783, "learning_rate": 3.940951858828005e-05, "loss": 1.4397, "step": 1233 }, { "epoch": 0.09607318378667705, "grad_norm": 3.1942866447007576, "learning_rate": 3.9408313970330914e-05, "loss": 1.5051, "step": 1234 }, { "epoch": 0.09615103887888667, "grad_norm": 2.816584535286514, "learning_rate": 3.940710814333505e-05, "loss": 1.486, "step": 1235 }, { "epoch": 0.0962288939710963, "grad_norm": 3.065338133827699, "learning_rate": 3.940590110736757e-05, "loss": 1.5234, "step": 1236 }, { "epoch": 0.09630674906330593, "grad_norm": 3.1119536360768874, "learning_rate": 3.9404692862503676e-05, "loss": 1.5366, "step": 1237 }, { "epoch": 0.09638460415551554, "grad_norm": 3.131602125122416, "learning_rate": 3.940348340881862e-05, "loss": 1.6474, "step": 1238 }, { "epoch": 0.09646245924772517, "grad_norm": 3.20274644389025, "learning_rate": 3.940227274638775e-05, "loss": 1.502, "step": 1239 }, { "epoch": 0.09654031433993479, "grad_norm": 3.0147222504616282, "learning_rate": 3.9401060875286485e-05, "loss": 1.3981, "step": 1240 }, { "epoch": 0.09661816943214442, "grad_norm": 3.1798245904922755, "learning_rate": 3.939984779559033e-05, "loss": 1.5576, "step": 1241 }, { "epoch": 0.09669602452435405, "grad_norm": 3.1422132853307194, "learning_rate": 3.939863350737483e-05, "loss": 1.5132, "step": 1242 }, { "epoch": 0.09677387961656367, "grad_norm": 3.070000533609544, "learning_rate": 3.939741801071566e-05, "loss": 1.469, "step": 1243 }, { "epoch": 0.0968517347087733, "grad_norm": 3.267845129204256, "learning_rate": 3.9396201305688516e-05, "loss": 1.5234, "step": 1244 }, { "epoch": 0.09692958980098292, "grad_norm": 3.0811329651466535, "learning_rate": 3.93949833923692e-05, "loss": 1.4473, "step": 1245 }, { "epoch": 0.09700744489319255, "grad_norm": 3.2770697772587463, "learning_rate": 3.939376427083358e-05, "loss": 1.4995, "step": 1246 }, { "epoch": 0.09708529998540218, "grad_norm": 2.9062922035946213, "learning_rate": 3.939254394115761e-05, "loss": 1.4907, "step": 1247 }, { "epoch": 0.09716315507761179, "grad_norm": 2.977228003082128, "learning_rate": 3.939132240341731e-05, "loss": 1.4692, "step": 1248 }, { "epoch": 0.09724101016982142, "grad_norm": 3.0803388009900203, "learning_rate": 3.9390099657688766e-05, "loss": 1.5386, "step": 1249 }, { "epoch": 0.09731886526203104, "grad_norm": 3.34445797512115, "learning_rate": 3.9388875704048144e-05, "loss": 1.5299, "step": 1250 }, { "epoch": 0.09731886526203104, "eval_loss": 0.18487776815891266, "eval_runtime": 162.0899, "eval_samples_per_second": 17.768, "eval_steps_per_second": 0.635, "step": 1250 }, { "epoch": 0.09739672035424067, "grad_norm": 3.1500804858714946, "learning_rate": 3.938765054257172e-05, "loss": 1.3645, "step": 1251 }, { "epoch": 0.0974745754464503, "grad_norm": 2.9491890155028813, "learning_rate": 3.9386424173335784e-05, "loss": 1.4747, "step": 1252 }, { "epoch": 0.09755243053865992, "grad_norm": 2.775509216198069, "learning_rate": 3.9385196596416755e-05, "loss": 1.4784, "step": 1253 }, { "epoch": 0.09763028563086955, "grad_norm": 2.94454169835259, "learning_rate": 3.938396781189109e-05, "loss": 1.4466, "step": 1254 }, { "epoch": 0.09770814072307916, "grad_norm": 2.7323128909975547, "learning_rate": 3.938273781983535e-05, "loss": 1.4192, "step": 1255 }, { "epoch": 0.0977859958152888, "grad_norm": 2.9987363008190884, "learning_rate": 3.938150662032614e-05, "loss": 1.4399, "step": 1256 }, { "epoch": 0.09786385090749841, "grad_norm": 3.031681056261154, "learning_rate": 3.9380274213440175e-05, "loss": 1.4265, "step": 1257 }, { "epoch": 0.09794170599970804, "grad_norm": 3.064524956816374, "learning_rate": 3.937904059925423e-05, "loss": 1.4969, "step": 1258 }, { "epoch": 0.09801956109191767, "grad_norm": 3.1137542806593648, "learning_rate": 3.9377805777845136e-05, "loss": 1.4889, "step": 1259 }, { "epoch": 0.09809741618412729, "grad_norm": 2.709612714589013, "learning_rate": 3.937656974928983e-05, "loss": 1.4392, "step": 1260 }, { "epoch": 0.09817527127633692, "grad_norm": 2.739195708338639, "learning_rate": 3.93753325136653e-05, "loss": 1.4227, "step": 1261 }, { "epoch": 0.09825312636854654, "grad_norm": 3.179877035979035, "learning_rate": 3.9374094071048625e-05, "loss": 1.4882, "step": 1262 }, { "epoch": 0.09833098146075617, "grad_norm": 2.9838889213729063, "learning_rate": 3.9372854421516966e-05, "loss": 1.4892, "step": 1263 }, { "epoch": 0.0984088365529658, "grad_norm": 2.8629830568038894, "learning_rate": 3.9371613565147535e-05, "loss": 1.4733, "step": 1264 }, { "epoch": 0.09848669164517541, "grad_norm": 2.9889194441867932, "learning_rate": 3.937037150201762e-05, "loss": 1.5036, "step": 1265 }, { "epoch": 0.09856454673738504, "grad_norm": 2.838232123368964, "learning_rate": 3.936912823220463e-05, "loss": 1.4565, "step": 1266 }, { "epoch": 0.09864240182959466, "grad_norm": 2.778778521294773, "learning_rate": 3.936788375578598e-05, "loss": 1.5721, "step": 1267 }, { "epoch": 0.09872025692180429, "grad_norm": 3.477496804069302, "learning_rate": 3.936663807283921e-05, "loss": 1.534, "step": 1268 }, { "epoch": 0.09879811201401392, "grad_norm": 2.9868166159719896, "learning_rate": 3.936539118344193e-05, "loss": 1.4699, "step": 1269 }, { "epoch": 0.09887596710622354, "grad_norm": 2.951368552328689, "learning_rate": 3.9364143087671795e-05, "loss": 1.5036, "step": 1270 }, { "epoch": 0.09895382219843317, "grad_norm": 3.6730774043159022, "learning_rate": 3.936289378560656e-05, "loss": 1.451, "step": 1271 }, { "epoch": 0.09903167729064279, "grad_norm": 3.010394106360847, "learning_rate": 3.9361643277324066e-05, "loss": 1.498, "step": 1272 }, { "epoch": 0.09910953238285242, "grad_norm": 3.413319145485178, "learning_rate": 3.93603915629022e-05, "loss": 1.433, "step": 1273 }, { "epoch": 0.09918738747506205, "grad_norm": 3.234527181266034, "learning_rate": 3.935913864241894e-05, "loss": 1.4361, "step": 1274 }, { "epoch": 0.09926524256727166, "grad_norm": 2.8029928891190927, "learning_rate": 3.935788451595235e-05, "loss": 1.4494, "step": 1275 }, { "epoch": 0.0993430976594813, "grad_norm": 3.922895978349873, "learning_rate": 3.935662918358053e-05, "loss": 1.5624, "step": 1276 }, { "epoch": 0.09942095275169091, "grad_norm": 2.8095791098456364, "learning_rate": 3.935537264538171e-05, "loss": 1.4455, "step": 1277 }, { "epoch": 0.09949880784390054, "grad_norm": 3.4582553255049238, "learning_rate": 3.9354114901434134e-05, "loss": 1.5324, "step": 1278 }, { "epoch": 0.09957666293611017, "grad_norm": 2.9816595853768164, "learning_rate": 3.9352855951816185e-05, "loss": 1.3265, "step": 1279 }, { "epoch": 0.09965451802831979, "grad_norm": 2.8403611257177706, "learning_rate": 3.935159579660627e-05, "loss": 1.4486, "step": 1280 }, { "epoch": 0.09973237312052942, "grad_norm": 3.176370752313597, "learning_rate": 3.9350334435882905e-05, "loss": 1.4916, "step": 1281 }, { "epoch": 0.09981022821273904, "grad_norm": 3.1121290467239042, "learning_rate": 3.9349071869724655e-05, "loss": 1.4733, "step": 1282 }, { "epoch": 0.09988808330494867, "grad_norm": 2.7547882389267535, "learning_rate": 3.9347808098210175e-05, "loss": 1.3569, "step": 1283 }, { "epoch": 0.09996593839715828, "grad_norm": 2.680361572986416, "learning_rate": 3.934654312141819e-05, "loss": 1.4463, "step": 1284 }, { "epoch": 0.10004379348936791, "grad_norm": 3.1303012319161785, "learning_rate": 3.934527693942751e-05, "loss": 1.4685, "step": 1285 }, { "epoch": 0.10012164858157754, "grad_norm": 2.915928467337832, "learning_rate": 3.9344009552317014e-05, "loss": 1.4892, "step": 1286 }, { "epoch": 0.10019950367378716, "grad_norm": 2.8472815587458586, "learning_rate": 3.934274096016564e-05, "loss": 1.4459, "step": 1287 }, { "epoch": 0.10027735876599679, "grad_norm": 2.7509728548930923, "learning_rate": 3.934147116305242e-05, "loss": 1.522, "step": 1288 }, { "epoch": 0.10035521385820641, "grad_norm": 2.6944568883897353, "learning_rate": 3.9340200161056464e-05, "loss": 1.4016, "step": 1289 }, { "epoch": 0.10043306895041604, "grad_norm": 2.670043061097415, "learning_rate": 3.933892795425695e-05, "loss": 1.4844, "step": 1290 }, { "epoch": 0.10051092404262567, "grad_norm": 2.955029746056834, "learning_rate": 3.933765454273312e-05, "loss": 1.4423, "step": 1291 }, { "epoch": 0.10058877913483528, "grad_norm": 2.7906205810053706, "learning_rate": 3.93363799265643e-05, "loss": 1.4127, "step": 1292 }, { "epoch": 0.10066663422704492, "grad_norm": 2.8317583686047225, "learning_rate": 3.933510410582991e-05, "loss": 1.3916, "step": 1293 }, { "epoch": 0.10074448931925453, "grad_norm": 2.9275597124101402, "learning_rate": 3.9333827080609415e-05, "loss": 1.4537, "step": 1294 }, { "epoch": 0.10082234441146416, "grad_norm": 2.9562134477850375, "learning_rate": 3.9332548850982365e-05, "loss": 1.4318, "step": 1295 }, { "epoch": 0.1009001995036738, "grad_norm": 2.8511171911999207, "learning_rate": 3.93312694170284e-05, "loss": 1.3666, "step": 1296 }, { "epoch": 0.10097805459588341, "grad_norm": 2.9385509994671724, "learning_rate": 3.932998877882721e-05, "loss": 1.422, "step": 1297 }, { "epoch": 0.10105590968809304, "grad_norm": 3.0723701243939527, "learning_rate": 3.932870693645858e-05, "loss": 1.4275, "step": 1298 }, { "epoch": 0.10113376478030266, "grad_norm": 2.9792214207138428, "learning_rate": 3.932742389000236e-05, "loss": 1.36, "step": 1299 }, { "epoch": 0.10121161987251229, "grad_norm": 2.7745874940572963, "learning_rate": 3.9326139639538474e-05, "loss": 1.4123, "step": 1300 }, { "epoch": 0.10121161987251229, "eval_loss": 0.17980127036571503, "eval_runtime": 162.0654, "eval_samples_per_second": 17.771, "eval_steps_per_second": 0.636, "step": 1300 }, { "epoch": 0.10128947496472192, "grad_norm": 2.958055086671013, "learning_rate": 3.932485418514694e-05, "loss": 1.3946, "step": 1301 }, { "epoch": 0.10136733005693153, "grad_norm": 2.7682490188795317, "learning_rate": 3.9323567526907816e-05, "loss": 1.4283, "step": 1302 }, { "epoch": 0.10144518514914116, "grad_norm": 3.1527763100247665, "learning_rate": 3.932227966490127e-05, "loss": 1.4741, "step": 1303 }, { "epoch": 0.10152304024135078, "grad_norm": 2.9724719499646124, "learning_rate": 3.9320990599207523e-05, "loss": 1.4473, "step": 1304 }, { "epoch": 0.10160089533356041, "grad_norm": 2.9089113994820766, "learning_rate": 3.9319700329906874e-05, "loss": 1.5072, "step": 1305 }, { "epoch": 0.10167875042577003, "grad_norm": 3.0882819727993067, "learning_rate": 3.931840885707971e-05, "loss": 1.4202, "step": 1306 }, { "epoch": 0.10175660551797966, "grad_norm": 3.2395156038185897, "learning_rate": 3.9317116180806474e-05, "loss": 1.5188, "step": 1307 }, { "epoch": 0.10183446061018929, "grad_norm": 2.8356844759446447, "learning_rate": 3.93158223011677e-05, "loss": 1.4498, "step": 1308 }, { "epoch": 0.1019123157023989, "grad_norm": 3.1866899116956104, "learning_rate": 3.931452721824399e-05, "loss": 1.514, "step": 1309 }, { "epoch": 0.10199017079460854, "grad_norm": 2.9643558407489046, "learning_rate": 3.931323093211602e-05, "loss": 1.3807, "step": 1310 }, { "epoch": 0.10206802588681815, "grad_norm": 3.872664358225831, "learning_rate": 3.9311933442864545e-05, "loss": 1.4946, "step": 1311 }, { "epoch": 0.10214588097902778, "grad_norm": 2.889243833436572, "learning_rate": 3.9310634750570385e-05, "loss": 1.4569, "step": 1312 }, { "epoch": 0.10222373607123741, "grad_norm": 2.9847296560248155, "learning_rate": 3.930933485531445e-05, "loss": 1.4152, "step": 1313 }, { "epoch": 0.10230159116344703, "grad_norm": 2.9361692209251413, "learning_rate": 3.930803375717772e-05, "loss": 1.3955, "step": 1314 }, { "epoch": 0.10237944625565666, "grad_norm": 3.0444058654384083, "learning_rate": 3.9306731456241245e-05, "loss": 1.4026, "step": 1315 }, { "epoch": 0.10245730134786628, "grad_norm": 3.0323712774453497, "learning_rate": 3.930542795258614e-05, "loss": 1.4531, "step": 1316 }, { "epoch": 0.10253515644007591, "grad_norm": 3.0647324570448533, "learning_rate": 3.9304123246293625e-05, "loss": 1.4227, "step": 1317 }, { "epoch": 0.10261301153228554, "grad_norm": 2.976774610721561, "learning_rate": 3.930281733744496e-05, "loss": 1.3517, "step": 1318 }, { "epoch": 0.10269086662449516, "grad_norm": 3.1438802853519467, "learning_rate": 3.930151022612152e-05, "loss": 1.3683, "step": 1319 }, { "epoch": 0.10276872171670479, "grad_norm": 3.1687467010605577, "learning_rate": 3.930020191240471e-05, "loss": 1.5061, "step": 1320 }, { "epoch": 0.1028465768089144, "grad_norm": 3.4365689708753693, "learning_rate": 3.9298892396376036e-05, "loss": 1.4222, "step": 1321 }, { "epoch": 0.10292443190112403, "grad_norm": 2.9564477886302805, "learning_rate": 3.9297581678117084e-05, "loss": 1.4373, "step": 1322 }, { "epoch": 0.10300228699333366, "grad_norm": 2.887377557309, "learning_rate": 3.9296269757709495e-05, "loss": 1.4258, "step": 1323 }, { "epoch": 0.10308014208554328, "grad_norm": 2.907617272340415, "learning_rate": 3.9294956635235004e-05, "loss": 1.4278, "step": 1324 }, { "epoch": 0.10315799717775291, "grad_norm": 2.9207752681409973, "learning_rate": 3.9293642310775413e-05, "loss": 1.3762, "step": 1325 }, { "epoch": 0.10323585226996253, "grad_norm": 2.8257456325216133, "learning_rate": 3.929232678441259e-05, "loss": 1.4192, "step": 1326 }, { "epoch": 0.10331370736217216, "grad_norm": 3.0047522467839074, "learning_rate": 3.9291010056228494e-05, "loss": 1.3844, "step": 1327 }, { "epoch": 0.10339156245438179, "grad_norm": 2.7751204882999647, "learning_rate": 3.928969212630514e-05, "loss": 1.436, "step": 1328 }, { "epoch": 0.1034694175465914, "grad_norm": 2.883329390685121, "learning_rate": 3.928837299472464e-05, "loss": 1.4276, "step": 1329 }, { "epoch": 0.10354727263880104, "grad_norm": 2.9481245685141477, "learning_rate": 3.928705266156917e-05, "loss": 1.4433, "step": 1330 }, { "epoch": 0.10362512773101065, "grad_norm": 2.913121338540078, "learning_rate": 3.928573112692097e-05, "loss": 1.4152, "step": 1331 }, { "epoch": 0.10370298282322028, "grad_norm": 2.7472180241638893, "learning_rate": 3.928440839086237e-05, "loss": 1.4163, "step": 1332 }, { "epoch": 0.1037808379154299, "grad_norm": 2.6721098300585284, "learning_rate": 3.9283084453475785e-05, "loss": 1.3675, "step": 1333 }, { "epoch": 0.10385869300763953, "grad_norm": 2.739393348282329, "learning_rate": 3.928175931484366e-05, "loss": 1.3865, "step": 1334 }, { "epoch": 0.10393654809984916, "grad_norm": 2.8202959340374085, "learning_rate": 3.928043297504857e-05, "loss": 1.3808, "step": 1335 }, { "epoch": 0.10401440319205878, "grad_norm": 2.795703410228862, "learning_rate": 3.927910543417313e-05, "loss": 1.3298, "step": 1336 }, { "epoch": 0.10409225828426841, "grad_norm": 2.8103982660017914, "learning_rate": 3.927777669230005e-05, "loss": 1.371, "step": 1337 }, { "epoch": 0.10417011337647802, "grad_norm": 2.7107202403453172, "learning_rate": 3.927644674951209e-05, "loss": 1.376, "step": 1338 }, { "epoch": 0.10424796846868765, "grad_norm": 2.793218233967198, "learning_rate": 3.92751156058921e-05, "loss": 1.3965, "step": 1339 }, { "epoch": 0.10432582356089729, "grad_norm": 2.842167696466629, "learning_rate": 3.927378326152301e-05, "loss": 1.4288, "step": 1340 }, { "epoch": 0.1044036786531069, "grad_norm": 2.7928143188088392, "learning_rate": 3.9272449716487824e-05, "loss": 1.4626, "step": 1341 }, { "epoch": 0.10448153374531653, "grad_norm": 2.7183071728858645, "learning_rate": 3.927111497086961e-05, "loss": 1.3136, "step": 1342 }, { "epoch": 0.10455938883752615, "grad_norm": 2.8092129626550895, "learning_rate": 3.92697790247515e-05, "loss": 1.433, "step": 1343 }, { "epoch": 0.10463724392973578, "grad_norm": 2.843541167286796, "learning_rate": 3.926844187821675e-05, "loss": 1.3977, "step": 1344 }, { "epoch": 0.10471509902194541, "grad_norm": 2.8116464405027872, "learning_rate": 3.926710353134863e-05, "loss": 1.3961, "step": 1345 }, { "epoch": 0.10479295411415503, "grad_norm": 2.9360338325527278, "learning_rate": 3.9265763984230524e-05, "loss": 1.3986, "step": 1346 }, { "epoch": 0.10487080920636466, "grad_norm": 3.0320597057250347, "learning_rate": 3.9264423236945886e-05, "loss": 1.3706, "step": 1347 }, { "epoch": 0.10494866429857427, "grad_norm": 2.775657673359858, "learning_rate": 3.9263081289578225e-05, "loss": 1.3822, "step": 1348 }, { "epoch": 0.1050265193907839, "grad_norm": 2.8360925665348082, "learning_rate": 3.926173814221114e-05, "loss": 1.3225, "step": 1349 }, { "epoch": 0.10510437448299353, "grad_norm": 2.832064664922912, "learning_rate": 3.926039379492832e-05, "loss": 1.4032, "step": 1350 }, { "epoch": 0.10510437448299353, "eval_loss": 0.17516767978668213, "eval_runtime": 162.1623, "eval_samples_per_second": 17.76, "eval_steps_per_second": 0.635, "step": 1350 }, { "epoch": 0.10518222957520315, "grad_norm": 2.868249324831596, "learning_rate": 3.925904824781349e-05, "loss": 1.302, "step": 1351 }, { "epoch": 0.10526008466741278, "grad_norm": 2.791489551491862, "learning_rate": 3.925770150095048e-05, "loss": 1.3339, "step": 1352 }, { "epoch": 0.1053379397596224, "grad_norm": 2.8295440573696564, "learning_rate": 3.925635355442319e-05, "loss": 1.4102, "step": 1353 }, { "epoch": 0.10541579485183203, "grad_norm": 2.6218295929278237, "learning_rate": 3.925500440831558e-05, "loss": 1.3339, "step": 1354 }, { "epoch": 0.10549364994404166, "grad_norm": 3.022347132132273, "learning_rate": 3.925365406271171e-05, "loss": 1.4079, "step": 1355 }, { "epoch": 0.10557150503625128, "grad_norm": 2.925227848794827, "learning_rate": 3.9252302517695684e-05, "loss": 1.4178, "step": 1356 }, { "epoch": 0.1056493601284609, "grad_norm": 2.899307465320325, "learning_rate": 3.925094977335171e-05, "loss": 1.4293, "step": 1357 }, { "epoch": 0.10572721522067052, "grad_norm": 2.820336304914553, "learning_rate": 3.924959582976405e-05, "loss": 1.3931, "step": 1358 }, { "epoch": 0.10580507031288015, "grad_norm": 2.7865802326805627, "learning_rate": 3.924824068701705e-05, "loss": 1.4094, "step": 1359 }, { "epoch": 0.10588292540508977, "grad_norm": 2.7860155192697156, "learning_rate": 3.924688434519513e-05, "loss": 1.3218, "step": 1360 }, { "epoch": 0.1059607804972994, "grad_norm": 2.664541905767347, "learning_rate": 3.9245526804382785e-05, "loss": 1.373, "step": 1361 }, { "epoch": 0.10603863558950903, "grad_norm": 2.7819642722779276, "learning_rate": 3.924416806466459e-05, "loss": 1.3631, "step": 1362 }, { "epoch": 0.10611649068171865, "grad_norm": 2.7872025110076666, "learning_rate": 3.924280812612517e-05, "loss": 1.4089, "step": 1363 }, { "epoch": 0.10619434577392828, "grad_norm": 2.850925033968412, "learning_rate": 3.9241446988849254e-05, "loss": 1.3749, "step": 1364 }, { "epoch": 0.1062722008661379, "grad_norm": 3.0543150077428454, "learning_rate": 3.924008465292163e-05, "loss": 1.3368, "step": 1365 }, { "epoch": 0.10635005595834753, "grad_norm": 2.9847635409948787, "learning_rate": 3.923872111842718e-05, "loss": 1.3635, "step": 1366 }, { "epoch": 0.10642791105055716, "grad_norm": 3.027036625481377, "learning_rate": 3.9237356385450824e-05, "loss": 1.3931, "step": 1367 }, { "epoch": 0.10650576614276677, "grad_norm": 2.8629848194198355, "learning_rate": 3.923599045407759e-05, "loss": 1.4112, "step": 1368 }, { "epoch": 0.1065836212349764, "grad_norm": 3.069657822521365, "learning_rate": 3.9234623324392573e-05, "loss": 1.4797, "step": 1369 }, { "epoch": 0.10666147632718602, "grad_norm": 2.7414475064653945, "learning_rate": 3.923325499648093e-05, "loss": 1.4104, "step": 1370 }, { "epoch": 0.10673933141939565, "grad_norm": 2.8670422855401756, "learning_rate": 3.923188547042791e-05, "loss": 1.3168, "step": 1371 }, { "epoch": 0.10681718651160528, "grad_norm": 2.821664393326855, "learning_rate": 3.923051474631882e-05, "loss": 1.4293, "step": 1372 }, { "epoch": 0.1068950416038149, "grad_norm": 2.8983727372460426, "learning_rate": 3.922914282423905e-05, "loss": 1.4037, "step": 1373 }, { "epoch": 0.10697289669602453, "grad_norm": 2.7635065503000313, "learning_rate": 3.922776970427407e-05, "loss": 1.4014, "step": 1374 }, { "epoch": 0.10705075178823414, "grad_norm": 2.932590764083402, "learning_rate": 3.9226395386509415e-05, "loss": 1.4421, "step": 1375 }, { "epoch": 0.10712860688044377, "grad_norm": 2.7052514633810394, "learning_rate": 3.9225019871030695e-05, "loss": 1.3265, "step": 1376 }, { "epoch": 0.1072064619726534, "grad_norm": 2.7897399229069433, "learning_rate": 3.922364315792361e-05, "loss": 1.4059, "step": 1377 }, { "epoch": 0.10728431706486302, "grad_norm": 2.8262469561305177, "learning_rate": 3.922226524727391e-05, "loss": 1.4299, "step": 1378 }, { "epoch": 0.10736217215707265, "grad_norm": 2.7424292560600336, "learning_rate": 3.922088613916744e-05, "loss": 1.4549, "step": 1379 }, { "epoch": 0.10744002724928227, "grad_norm": 2.8493441140674896, "learning_rate": 3.921950583369011e-05, "loss": 1.3698, "step": 1380 }, { "epoch": 0.1075178823414919, "grad_norm": 2.5728751246579917, "learning_rate": 3.921812433092791e-05, "loss": 1.2982, "step": 1381 }, { "epoch": 0.10759573743370153, "grad_norm": 3.0426306681424324, "learning_rate": 3.921674163096689e-05, "loss": 1.424, "step": 1382 }, { "epoch": 0.10767359252591115, "grad_norm": 2.8229241869964987, "learning_rate": 3.92153577338932e-05, "loss": 1.2955, "step": 1383 }, { "epoch": 0.10775144761812078, "grad_norm": 2.8806482739736627, "learning_rate": 3.921397263979303e-05, "loss": 1.3661, "step": 1384 }, { "epoch": 0.1078293027103304, "grad_norm": 2.873082877870848, "learning_rate": 3.921258634875269e-05, "loss": 1.3964, "step": 1385 }, { "epoch": 0.10790715780254002, "grad_norm": 2.662355675123103, "learning_rate": 3.921119886085853e-05, "loss": 1.3772, "step": 1386 }, { "epoch": 0.10798501289474964, "grad_norm": 2.62733260287023, "learning_rate": 3.9209810176196974e-05, "loss": 1.3871, "step": 1387 }, { "epoch": 0.10806286798695927, "grad_norm": 2.6924291499952937, "learning_rate": 3.920842029485454e-05, "loss": 1.3874, "step": 1388 }, { "epoch": 0.1081407230791689, "grad_norm": 2.5565701643646612, "learning_rate": 3.920702921691781e-05, "loss": 1.3466, "step": 1389 }, { "epoch": 0.10821857817137852, "grad_norm": 2.7938264596458633, "learning_rate": 3.9205636942473436e-05, "loss": 1.4243, "step": 1390 }, { "epoch": 0.10829643326358815, "grad_norm": 2.611501767368263, "learning_rate": 3.920424347160816e-05, "loss": 1.3265, "step": 1391 }, { "epoch": 0.10837428835579777, "grad_norm": 2.596233656206118, "learning_rate": 3.920284880440879e-05, "loss": 1.3185, "step": 1392 }, { "epoch": 0.1084521434480074, "grad_norm": 2.6804119604558903, "learning_rate": 3.920145294096219e-05, "loss": 1.3064, "step": 1393 }, { "epoch": 0.10852999854021703, "grad_norm": 2.7436051288183148, "learning_rate": 3.920005588135533e-05, "loss": 1.3666, "step": 1394 }, { "epoch": 0.10860785363242664, "grad_norm": 2.7196820661200376, "learning_rate": 3.9198657625675236e-05, "loss": 1.3775, "step": 1395 }, { "epoch": 0.10868570872463627, "grad_norm": 2.6888637355791625, "learning_rate": 3.919725817400902e-05, "loss": 1.3202, "step": 1396 }, { "epoch": 0.10876356381684589, "grad_norm": 2.607300364198731, "learning_rate": 3.919585752644385e-05, "loss": 1.276, "step": 1397 }, { "epoch": 0.10884141890905552, "grad_norm": 2.5326871426980335, "learning_rate": 3.919445568306699e-05, "loss": 1.3256, "step": 1398 }, { "epoch": 0.10891927400126515, "grad_norm": 2.8062442736196584, "learning_rate": 3.919305264396575e-05, "loss": 1.355, "step": 1399 }, { "epoch": 0.10899712909347477, "grad_norm": 2.6855914709061253, "learning_rate": 3.9191648409227554e-05, "loss": 1.3386, "step": 1400 }, { "epoch": 0.10899712909347477, "eval_loss": 0.1704782247543335, "eval_runtime": 163.1016, "eval_samples_per_second": 17.658, "eval_steps_per_second": 0.632, "step": 1400 }, { "epoch": 0.1090749841856844, "grad_norm": 2.7927485665919094, "learning_rate": 3.919024297893987e-05, "loss": 1.3575, "step": 1401 }, { "epoch": 0.10915283927789401, "grad_norm": 2.82673399853896, "learning_rate": 3.918883635319025e-05, "loss": 1.3224, "step": 1402 }, { "epoch": 0.10923069437010365, "grad_norm": 3.1195330934157752, "learning_rate": 3.918742853206632e-05, "loss": 1.3742, "step": 1403 }, { "epoch": 0.10930854946231328, "grad_norm": 2.9767668205170765, "learning_rate": 3.918601951565578e-05, "loss": 1.4519, "step": 1404 }, { "epoch": 0.10938640455452289, "grad_norm": 3.0135977448642506, "learning_rate": 3.9184609304046416e-05, "loss": 1.3186, "step": 1405 }, { "epoch": 0.10946425964673252, "grad_norm": 2.8017566408106336, "learning_rate": 3.918319789732606e-05, "loss": 1.2965, "step": 1406 }, { "epoch": 0.10954211473894214, "grad_norm": 2.729017110578763, "learning_rate": 3.918178529558265e-05, "loss": 1.3495, "step": 1407 }, { "epoch": 0.10961996983115177, "grad_norm": 2.7881492366749705, "learning_rate": 3.918037149890417e-05, "loss": 1.3158, "step": 1408 }, { "epoch": 0.10969782492336139, "grad_norm": 2.7856939341361118, "learning_rate": 3.91789565073787e-05, "loss": 1.3723, "step": 1409 }, { "epoch": 0.10977568001557102, "grad_norm": 2.9418662195794028, "learning_rate": 3.917754032109439e-05, "loss": 1.3661, "step": 1410 }, { "epoch": 0.10985353510778065, "grad_norm": 2.7596999839744614, "learning_rate": 3.917612294013947e-05, "loss": 1.3358, "step": 1411 }, { "epoch": 0.10993139019999026, "grad_norm": 2.6420191129012163, "learning_rate": 3.9174704364602216e-05, "loss": 1.2612, "step": 1412 }, { "epoch": 0.1100092452921999, "grad_norm": 2.7342378706241544, "learning_rate": 3.917328459457101e-05, "loss": 1.3878, "step": 1413 }, { "epoch": 0.11008710038440951, "grad_norm": 2.99360865458388, "learning_rate": 3.917186363013431e-05, "loss": 1.3531, "step": 1414 }, { "epoch": 0.11016495547661914, "grad_norm": 2.7487133509383863, "learning_rate": 3.9170441471380604e-05, "loss": 1.3705, "step": 1415 }, { "epoch": 0.11024281056882877, "grad_norm": 3.1249150668745234, "learning_rate": 3.916901811839851e-05, "loss": 1.4463, "step": 1416 }, { "epoch": 0.11032066566103839, "grad_norm": 2.920262201192036, "learning_rate": 3.916759357127669e-05, "loss": 1.3818, "step": 1417 }, { "epoch": 0.11039852075324802, "grad_norm": 2.728267624931538, "learning_rate": 3.916616783010389e-05, "loss": 1.2799, "step": 1418 }, { "epoch": 0.11047637584545764, "grad_norm": 2.6741857995853797, "learning_rate": 3.916474089496892e-05, "loss": 1.278, "step": 1419 }, { "epoch": 0.11055423093766727, "grad_norm": 2.5426111334291366, "learning_rate": 3.916331276596067e-05, "loss": 1.3728, "step": 1420 }, { "epoch": 0.1106320860298769, "grad_norm": 2.93764447518535, "learning_rate": 3.916188344316812e-05, "loss": 1.4363, "step": 1421 }, { "epoch": 0.11070994112208651, "grad_norm": 2.6881444869984668, "learning_rate": 3.916045292668029e-05, "loss": 1.3011, "step": 1422 }, { "epoch": 0.11078779621429614, "grad_norm": 2.635824903612047, "learning_rate": 3.915902121658631e-05, "loss": 1.3091, "step": 1423 }, { "epoch": 0.11086565130650576, "grad_norm": 2.675719516539105, "learning_rate": 3.915758831297536e-05, "loss": 1.2561, "step": 1424 }, { "epoch": 0.11094350639871539, "grad_norm": 3.0142797679491324, "learning_rate": 3.9156154215936715e-05, "loss": 1.4122, "step": 1425 }, { "epoch": 0.11102136149092502, "grad_norm": 2.44464283288602, "learning_rate": 3.9154718925559694e-05, "loss": 1.2301, "step": 1426 }, { "epoch": 0.11109921658313464, "grad_norm": 2.8055566017374547, "learning_rate": 3.915328244193372e-05, "loss": 1.3246, "step": 1427 }, { "epoch": 0.11117707167534427, "grad_norm": 2.8745734353181507, "learning_rate": 3.915184476514829e-05, "loss": 1.3002, "step": 1428 }, { "epoch": 0.11125492676755389, "grad_norm": 2.8559476362887235, "learning_rate": 3.915040589529294e-05, "loss": 1.3673, "step": 1429 }, { "epoch": 0.11133278185976352, "grad_norm": 2.6861596613535914, "learning_rate": 3.9148965832457325e-05, "loss": 1.2721, "step": 1430 }, { "epoch": 0.11141063695197315, "grad_norm": 2.4252318154988406, "learning_rate": 3.914752457673114e-05, "loss": 1.2716, "step": 1431 }, { "epoch": 0.11148849204418276, "grad_norm": 2.6629100558402414, "learning_rate": 3.914608212820418e-05, "loss": 1.386, "step": 1432 }, { "epoch": 0.1115663471363924, "grad_norm": 2.767036769944377, "learning_rate": 3.91446384869663e-05, "loss": 1.3289, "step": 1433 }, { "epoch": 0.11164420222860201, "grad_norm": 2.6986837155474532, "learning_rate": 3.9143193653107426e-05, "loss": 1.3764, "step": 1434 }, { "epoch": 0.11172205732081164, "grad_norm": 3.1836696331498624, "learning_rate": 3.914174762671757e-05, "loss": 1.3046, "step": 1435 }, { "epoch": 0.11179991241302126, "grad_norm": 2.433133402736127, "learning_rate": 3.914030040788681e-05, "loss": 1.2342, "step": 1436 }, { "epoch": 0.11187776750523089, "grad_norm": 2.868424458849673, "learning_rate": 3.91388519967053e-05, "loss": 1.298, "step": 1437 }, { "epoch": 0.11195562259744052, "grad_norm": 2.7355179897068584, "learning_rate": 3.913740239326328e-05, "loss": 1.3241, "step": 1438 }, { "epoch": 0.11203347768965013, "grad_norm": 2.6301020598078737, "learning_rate": 3.913595159765104e-05, "loss": 1.3068, "step": 1439 }, { "epoch": 0.11211133278185977, "grad_norm": 3.0086345724849037, "learning_rate": 3.9134499609958956e-05, "loss": 1.31, "step": 1440 }, { "epoch": 0.11218918787406938, "grad_norm": 2.643150156707732, "learning_rate": 3.913304643027749e-05, "loss": 1.3341, "step": 1441 }, { "epoch": 0.11226704296627901, "grad_norm": 3.124371838953365, "learning_rate": 3.913159205869717e-05, "loss": 1.3212, "step": 1442 }, { "epoch": 0.11234489805848864, "grad_norm": 2.6739741281367038, "learning_rate": 3.913013649530858e-05, "loss": 1.3695, "step": 1443 }, { "epoch": 0.11242275315069826, "grad_norm": 2.9086020579866787, "learning_rate": 3.9128679740202414e-05, "loss": 1.1964, "step": 1444 }, { "epoch": 0.11250060824290789, "grad_norm": 2.796254168774308, "learning_rate": 3.9127221793469414e-05, "loss": 1.3442, "step": 1445 }, { "epoch": 0.1125784633351175, "grad_norm": 3.0700308409197543, "learning_rate": 3.9125762655200395e-05, "loss": 1.3883, "step": 1446 }, { "epoch": 0.11265631842732714, "grad_norm": 2.891060704563076, "learning_rate": 3.912430232548627e-05, "loss": 1.371, "step": 1447 }, { "epoch": 0.11273417351953677, "grad_norm": 2.7312593037674766, "learning_rate": 3.9122840804418e-05, "loss": 1.3297, "step": 1448 }, { "epoch": 0.11281202861174638, "grad_norm": 2.711081723587506, "learning_rate": 3.912137809208663e-05, "loss": 1.3297, "step": 1449 }, { "epoch": 0.11288988370395601, "grad_norm": 2.979001820976936, "learning_rate": 3.911991418858328e-05, "loss": 1.2764, "step": 1450 }, { "epoch": 0.11288988370395601, "eval_loss": 0.16355931758880615, "eval_runtime": 162.05, "eval_samples_per_second": 17.772, "eval_steps_per_second": 0.636, "step": 1450 }, { "epoch": 0.11296773879616563, "grad_norm": 2.553486141042278, "learning_rate": 3.911844909399916e-05, "loss": 1.268, "step": 1451 }, { "epoch": 0.11304559388837526, "grad_norm": 3.055265763707771, "learning_rate": 3.911698280842551e-05, "loss": 1.3888, "step": 1452 }, { "epoch": 0.11312344898058489, "grad_norm": 2.712504663231617, "learning_rate": 3.91155153319537e-05, "loss": 1.3354, "step": 1453 }, { "epoch": 0.11320130407279451, "grad_norm": 2.9066436796448283, "learning_rate": 3.9114046664675134e-05, "loss": 1.3317, "step": 1454 }, { "epoch": 0.11327915916500414, "grad_norm": 2.7063576304480805, "learning_rate": 3.9112576806681304e-05, "loss": 1.4096, "step": 1455 }, { "epoch": 0.11335701425721376, "grad_norm": 2.6749309954960694, "learning_rate": 3.911110575806378e-05, "loss": 1.2623, "step": 1456 }, { "epoch": 0.11343486934942339, "grad_norm": 2.6152412216946908, "learning_rate": 3.910963351891419e-05, "loss": 1.2462, "step": 1457 }, { "epoch": 0.11351272444163302, "grad_norm": 2.842794784246309, "learning_rate": 3.910816008932426e-05, "loss": 1.3593, "step": 1458 }, { "epoch": 0.11359057953384263, "grad_norm": 2.753219656906981, "learning_rate": 3.910668546938577e-05, "loss": 1.3316, "step": 1459 }, { "epoch": 0.11366843462605226, "grad_norm": 2.702707410632608, "learning_rate": 3.910520965919059e-05, "loss": 1.2577, "step": 1460 }, { "epoch": 0.11374628971826188, "grad_norm": 3.0092103888655886, "learning_rate": 3.910373265883064e-05, "loss": 1.2907, "step": 1461 }, { "epoch": 0.11382414481047151, "grad_norm": 2.7170596016385686, "learning_rate": 3.910225446839795e-05, "loss": 1.2812, "step": 1462 }, { "epoch": 0.11390199990268113, "grad_norm": 2.7717906261113137, "learning_rate": 3.9100775087984594e-05, "loss": 1.3727, "step": 1463 }, { "epoch": 0.11397985499489076, "grad_norm": 2.7603293989561584, "learning_rate": 3.909929451768273e-05, "loss": 1.3235, "step": 1464 }, { "epoch": 0.11405771008710039, "grad_norm": 3.2459473429181678, "learning_rate": 3.909781275758459e-05, "loss": 1.2674, "step": 1465 }, { "epoch": 0.11413556517931, "grad_norm": 2.650845123478446, "learning_rate": 3.9096329807782484e-05, "loss": 1.2687, "step": 1466 }, { "epoch": 0.11421342027151964, "grad_norm": 2.8908063425840504, "learning_rate": 3.9094845668368796e-05, "loss": 1.3354, "step": 1467 }, { "epoch": 0.11429127536372925, "grad_norm": 3.3540194086263555, "learning_rate": 3.909336033943597e-05, "loss": 1.2725, "step": 1468 }, { "epoch": 0.11436913045593888, "grad_norm": 3.0343549401965673, "learning_rate": 3.909187382107655e-05, "loss": 1.3427, "step": 1469 }, { "epoch": 0.11444698554814851, "grad_norm": 2.8641866513656256, "learning_rate": 3.909038611338313e-05, "loss": 1.2517, "step": 1470 }, { "epoch": 0.11452484064035813, "grad_norm": 2.629470723001076, "learning_rate": 3.9088897216448376e-05, "loss": 1.2529, "step": 1471 }, { "epoch": 0.11460269573256776, "grad_norm": 3.200828359038233, "learning_rate": 3.908740713036506e-05, "loss": 1.3168, "step": 1472 }, { "epoch": 0.11468055082477738, "grad_norm": 2.7355808093542744, "learning_rate": 3.9085915855226e-05, "loss": 1.2476, "step": 1473 }, { "epoch": 0.11475840591698701, "grad_norm": 3.1567194452023664, "learning_rate": 3.908442339112409e-05, "loss": 1.2558, "step": 1474 }, { "epoch": 0.11483626100919664, "grad_norm": 2.8006263497643786, "learning_rate": 3.908292973815231e-05, "loss": 1.3051, "step": 1475 }, { "epoch": 0.11491411610140626, "grad_norm": 3.113596617607996, "learning_rate": 3.90814348964037e-05, "loss": 1.2891, "step": 1476 }, { "epoch": 0.11499197119361589, "grad_norm": 3.273030307509974, "learning_rate": 3.9079938865971387e-05, "loss": 1.3514, "step": 1477 }, { "epoch": 0.1150698262858255, "grad_norm": 2.777251240284016, "learning_rate": 3.907844164694857e-05, "loss": 1.2988, "step": 1478 }, { "epoch": 0.11514768137803513, "grad_norm": 3.0700124274959792, "learning_rate": 3.9076943239428516e-05, "loss": 1.2418, "step": 1479 }, { "epoch": 0.11522553647024476, "grad_norm": 2.737213850937718, "learning_rate": 3.9075443643504563e-05, "loss": 1.2188, "step": 1480 }, { "epoch": 0.11530339156245438, "grad_norm": 2.985239002205463, "learning_rate": 3.907394285927014e-05, "loss": 1.3522, "step": 1481 }, { "epoch": 0.11538124665466401, "grad_norm": 2.81278126312474, "learning_rate": 3.907244088681873e-05, "loss": 1.2751, "step": 1482 }, { "epoch": 0.11545910174687363, "grad_norm": 2.843267541034675, "learning_rate": 3.90709377262439e-05, "loss": 1.3208, "step": 1483 }, { "epoch": 0.11553695683908326, "grad_norm": 3.0253390674059397, "learning_rate": 3.9069433377639285e-05, "loss": 1.2685, "step": 1484 }, { "epoch": 0.11561481193129289, "grad_norm": 3.104604804473773, "learning_rate": 3.9067927841098614e-05, "loss": 1.2223, "step": 1485 }, { "epoch": 0.1156926670235025, "grad_norm": 2.967003033392855, "learning_rate": 3.9066421116715665e-05, "loss": 1.3899, "step": 1486 }, { "epoch": 0.11577052211571214, "grad_norm": 3.243126406243535, "learning_rate": 3.9064913204584296e-05, "loss": 1.3901, "step": 1487 }, { "epoch": 0.11584837720792175, "grad_norm": 2.81203513863753, "learning_rate": 3.9063404104798454e-05, "loss": 1.2989, "step": 1488 }, { "epoch": 0.11592623230013138, "grad_norm": 3.2692787311106275, "learning_rate": 3.906189381745213e-05, "loss": 1.3283, "step": 1489 }, { "epoch": 0.116004087392341, "grad_norm": 2.859079117862023, "learning_rate": 3.906038234263943e-05, "loss": 1.3352, "step": 1490 }, { "epoch": 0.11608194248455063, "grad_norm": 3.1848692689887264, "learning_rate": 3.90588696804545e-05, "loss": 1.2372, "step": 1491 }, { "epoch": 0.11615979757676026, "grad_norm": 2.6646801851922177, "learning_rate": 3.905735583099157e-05, "loss": 1.2271, "step": 1492 }, { "epoch": 0.11623765266896988, "grad_norm": 2.9132865134114776, "learning_rate": 3.905584079434495e-05, "loss": 1.2208, "step": 1493 }, { "epoch": 0.1163155077611795, "grad_norm": 2.664052324881177, "learning_rate": 3.9054324570609026e-05, "loss": 1.2067, "step": 1494 }, { "epoch": 0.11639336285338912, "grad_norm": 2.767429770316771, "learning_rate": 3.905280715987824e-05, "loss": 1.2475, "step": 1495 }, { "epoch": 0.11647121794559875, "grad_norm": 2.577830889777106, "learning_rate": 3.9051288562247125e-05, "loss": 1.2212, "step": 1496 }, { "epoch": 0.11654907303780838, "grad_norm": 2.729029336894543, "learning_rate": 3.9049768777810277e-05, "loss": 1.2718, "step": 1497 }, { "epoch": 0.116626928130018, "grad_norm": 2.9210452655011774, "learning_rate": 3.904824780666238e-05, "loss": 1.2139, "step": 1498 }, { "epoch": 0.11670478322222763, "grad_norm": 2.785593948722937, "learning_rate": 3.904672564889817e-05, "loss": 1.2681, "step": 1499 }, { "epoch": 0.11678263831443725, "grad_norm": 2.599643913890701, "learning_rate": 3.904520230461249e-05, "loss": 1.2984, "step": 1500 }, { "epoch": 0.11678263831443725, "eval_loss": 0.15780694782733917, "eval_runtime": 162.4558, "eval_samples_per_second": 17.728, "eval_steps_per_second": 0.634, "step": 1500 }, { "epoch": 0.11686049340664688, "grad_norm": 3.0169971165438056, "learning_rate": 3.904367777390023e-05, "loss": 1.2992, "step": 1501 }, { "epoch": 0.11693834849885651, "grad_norm": 2.6483058897302794, "learning_rate": 3.9042152056856354e-05, "loss": 1.288, "step": 1502 }, { "epoch": 0.11701620359106613, "grad_norm": 2.7421052718348946, "learning_rate": 3.9040625153575905e-05, "loss": 1.2369, "step": 1503 }, { "epoch": 0.11709405868327576, "grad_norm": 2.4769269336077704, "learning_rate": 3.903909706415402e-05, "loss": 1.22, "step": 1504 }, { "epoch": 0.11717191377548537, "grad_norm": 2.5985128133744797, "learning_rate": 3.9037567788685865e-05, "loss": 1.1665, "step": 1505 }, { "epoch": 0.117249768867695, "grad_norm": 2.6734742371050526, "learning_rate": 3.903603732726673e-05, "loss": 1.3467, "step": 1506 }, { "epoch": 0.11732762395990463, "grad_norm": 2.8512608338662524, "learning_rate": 3.903450567999194e-05, "loss": 1.3364, "step": 1507 }, { "epoch": 0.11740547905211425, "grad_norm": 2.966868261829137, "learning_rate": 3.9032972846956925e-05, "loss": 1.2072, "step": 1508 }, { "epoch": 0.11748333414432388, "grad_norm": 2.5821121903580937, "learning_rate": 3.903143882825716e-05, "loss": 1.1919, "step": 1509 }, { "epoch": 0.1175611892365335, "grad_norm": 2.76374994930607, "learning_rate": 3.902990362398821e-05, "loss": 1.3438, "step": 1510 }, { "epoch": 0.11763904432874313, "grad_norm": 2.7405817199309044, "learning_rate": 3.902836723424572e-05, "loss": 1.3267, "step": 1511 }, { "epoch": 0.11771689942095276, "grad_norm": 2.540457563947611, "learning_rate": 3.902682965912538e-05, "loss": 1.2686, "step": 1512 }, { "epoch": 0.11779475451316238, "grad_norm": 2.6232051506902794, "learning_rate": 3.9025290898723e-05, "loss": 1.2514, "step": 1513 }, { "epoch": 0.117872609605372, "grad_norm": 2.6265585782959233, "learning_rate": 3.9023750953134416e-05, "loss": 1.2433, "step": 1514 }, { "epoch": 0.11795046469758162, "grad_norm": 2.697789055999897, "learning_rate": 3.902220982245557e-05, "loss": 1.236, "step": 1515 }, { "epoch": 0.11802831978979125, "grad_norm": 2.871908263010412, "learning_rate": 3.902066750678246e-05, "loss": 1.2888, "step": 1516 }, { "epoch": 0.11810617488200087, "grad_norm": 2.585713457664097, "learning_rate": 3.901912400621117e-05, "loss": 1.2583, "step": 1517 }, { "epoch": 0.1181840299742105, "grad_norm": 2.772269241890005, "learning_rate": 3.901757932083785e-05, "loss": 1.2705, "step": 1518 }, { "epoch": 0.11826188506642013, "grad_norm": 2.7292275558493593, "learning_rate": 3.901603345075874e-05, "loss": 1.2545, "step": 1519 }, { "epoch": 0.11833974015862975, "grad_norm": 2.6109328467267687, "learning_rate": 3.9014486396070125e-05, "loss": 1.2923, "step": 1520 }, { "epoch": 0.11841759525083938, "grad_norm": 2.6864451092540618, "learning_rate": 3.901293815686838e-05, "loss": 1.2466, "step": 1521 }, { "epoch": 0.118495450343049, "grad_norm": 2.6998897131474404, "learning_rate": 3.901138873324995e-05, "loss": 1.2387, "step": 1522 }, { "epoch": 0.11857330543525862, "grad_norm": 2.71122557799965, "learning_rate": 3.900983812531138e-05, "loss": 1.2093, "step": 1523 }, { "epoch": 0.11865116052746826, "grad_norm": 2.6702965250342547, "learning_rate": 3.900828633314924e-05, "loss": 1.2372, "step": 1524 }, { "epoch": 0.11872901561967787, "grad_norm": 2.7195128681317344, "learning_rate": 3.90067333568602e-05, "loss": 1.2344, "step": 1525 }, { "epoch": 0.1188068707118875, "grad_norm": 2.7319297115273047, "learning_rate": 3.9005179196541025e-05, "loss": 1.275, "step": 1526 }, { "epoch": 0.11888472580409712, "grad_norm": 2.8068387995491126, "learning_rate": 3.900362385228851e-05, "loss": 1.2367, "step": 1527 }, { "epoch": 0.11896258089630675, "grad_norm": 2.6439003917113264, "learning_rate": 3.900206732419956e-05, "loss": 1.2624, "step": 1528 }, { "epoch": 0.11904043598851638, "grad_norm": 2.7102278923814573, "learning_rate": 3.900050961237113e-05, "loss": 1.2748, "step": 1529 }, { "epoch": 0.119118291080726, "grad_norm": 2.760500452928646, "learning_rate": 3.8998950716900254e-05, "loss": 1.2571, "step": 1530 }, { "epoch": 0.11919614617293563, "grad_norm": 2.570961429128576, "learning_rate": 3.899739063788406e-05, "loss": 1.2358, "step": 1531 }, { "epoch": 0.11927400126514524, "grad_norm": 2.8526276858930113, "learning_rate": 3.899582937541973e-05, "loss": 1.2857, "step": 1532 }, { "epoch": 0.11935185635735487, "grad_norm": 2.698054509905325, "learning_rate": 3.899426692960451e-05, "loss": 1.2705, "step": 1533 }, { "epoch": 0.1194297114495645, "grad_norm": 2.9631953540881466, "learning_rate": 3.8992703300535745e-05, "loss": 1.2495, "step": 1534 }, { "epoch": 0.11950756654177412, "grad_norm": 2.4894784837133948, "learning_rate": 3.899113848831083e-05, "loss": 1.1786, "step": 1535 }, { "epoch": 0.11958542163398375, "grad_norm": 3.098224687359328, "learning_rate": 3.8989572493027264e-05, "loss": 1.2276, "step": 1536 }, { "epoch": 0.11966327672619337, "grad_norm": 2.9839172542661454, "learning_rate": 3.8988005314782584e-05, "loss": 1.215, "step": 1537 }, { "epoch": 0.119741131818403, "grad_norm": 2.757543891746117, "learning_rate": 3.898643695367443e-05, "loss": 1.2476, "step": 1538 }, { "epoch": 0.11981898691061262, "grad_norm": 3.0741917909920584, "learning_rate": 3.898486740980049e-05, "loss": 1.2411, "step": 1539 }, { "epoch": 0.11989684200282225, "grad_norm": 2.7087901196928637, "learning_rate": 3.898329668325856e-05, "loss": 1.1865, "step": 1540 }, { "epoch": 0.11997469709503188, "grad_norm": 2.907069578715173, "learning_rate": 3.898172477414646e-05, "loss": 1.239, "step": 1541 }, { "epoch": 0.12005255218724149, "grad_norm": 2.773309989536192, "learning_rate": 3.8980151682562137e-05, "loss": 1.1866, "step": 1542 }, { "epoch": 0.12013040727945112, "grad_norm": 2.7681282898059947, "learning_rate": 3.8978577408603575e-05, "loss": 1.3029, "step": 1543 }, { "epoch": 0.12020826237166074, "grad_norm": 2.8082111847743207, "learning_rate": 3.897700195236885e-05, "loss": 1.1949, "step": 1544 }, { "epoch": 0.12028611746387037, "grad_norm": 2.914167370217377, "learning_rate": 3.8975425313956104e-05, "loss": 1.2307, "step": 1545 }, { "epoch": 0.12036397255608, "grad_norm": 2.659871864211904, "learning_rate": 3.897384749346356e-05, "loss": 1.1809, "step": 1546 }, { "epoch": 0.12044182764828962, "grad_norm": 2.9899561310530576, "learning_rate": 3.897226849098949e-05, "loss": 1.2731, "step": 1547 }, { "epoch": 0.12051968274049925, "grad_norm": 2.9796067932963815, "learning_rate": 3.897068830663227e-05, "loss": 1.2838, "step": 1548 }, { "epoch": 0.12059753783270886, "grad_norm": 2.9928991280672337, "learning_rate": 3.896910694049035e-05, "loss": 1.2465, "step": 1549 }, { "epoch": 0.1206753929249185, "grad_norm": 2.7664191183108464, "learning_rate": 3.896752439266222e-05, "loss": 1.2282, "step": 1550 }, { "epoch": 0.1206753929249185, "eval_loss": 0.15440422296524048, "eval_runtime": 162.219, "eval_samples_per_second": 17.754, "eval_steps_per_second": 0.635, "step": 1550 }, { "epoch": 0.12075324801712813, "grad_norm": 2.8442325673376736, "learning_rate": 3.8965940663246485e-05, "loss": 1.1994, "step": 1551 }, { "epoch": 0.12083110310933774, "grad_norm": 2.721792604275934, "learning_rate": 3.896435575234179e-05, "loss": 1.2497, "step": 1552 }, { "epoch": 0.12090895820154737, "grad_norm": 2.6439698915980556, "learning_rate": 3.896276966004687e-05, "loss": 1.1756, "step": 1553 }, { "epoch": 0.12098681329375699, "grad_norm": 2.984176251105126, "learning_rate": 3.896118238646054e-05, "loss": 1.2379, "step": 1554 }, { "epoch": 0.12106466838596662, "grad_norm": 2.615086529883257, "learning_rate": 3.8959593931681666e-05, "loss": 1.2646, "step": 1555 }, { "epoch": 0.12114252347817625, "grad_norm": 2.4836375668276633, "learning_rate": 3.895800429580921e-05, "loss": 1.1627, "step": 1556 }, { "epoch": 0.12122037857038587, "grad_norm": 2.5853234861174363, "learning_rate": 3.895641347894219e-05, "loss": 1.213, "step": 1557 }, { "epoch": 0.1212982336625955, "grad_norm": 2.818658832624075, "learning_rate": 3.895482148117972e-05, "loss": 1.2603, "step": 1558 }, { "epoch": 0.12137608875480511, "grad_norm": 2.6302634053484146, "learning_rate": 3.895322830262097e-05, "loss": 1.1505, "step": 1559 }, { "epoch": 0.12145394384701474, "grad_norm": 2.6968932893783446, "learning_rate": 3.8951633943365176e-05, "loss": 1.1361, "step": 1560 }, { "epoch": 0.12153179893922438, "grad_norm": 2.826413863293549, "learning_rate": 3.895003840351168e-05, "loss": 1.2296, "step": 1561 }, { "epoch": 0.12160965403143399, "grad_norm": 2.7008514433284985, "learning_rate": 3.8948441683159855e-05, "loss": 1.1864, "step": 1562 }, { "epoch": 0.12168750912364362, "grad_norm": 3.0049939575208704, "learning_rate": 3.894684378240918e-05, "loss": 1.2501, "step": 1563 }, { "epoch": 0.12176536421585324, "grad_norm": 2.7223086336336357, "learning_rate": 3.894524470135919e-05, "loss": 1.2064, "step": 1564 }, { "epoch": 0.12184321930806287, "grad_norm": 2.6001855030454553, "learning_rate": 3.894364444010952e-05, "loss": 1.1488, "step": 1565 }, { "epoch": 0.12192107440027249, "grad_norm": 2.494969994802243, "learning_rate": 3.8942042998759834e-05, "loss": 1.1632, "step": 1566 }, { "epoch": 0.12199892949248212, "grad_norm": 2.5864342808907286, "learning_rate": 3.894044037740991e-05, "loss": 1.232, "step": 1567 }, { "epoch": 0.12207678458469175, "grad_norm": 2.4448843485650964, "learning_rate": 3.8938836576159575e-05, "loss": 1.1839, "step": 1568 }, { "epoch": 0.12215463967690136, "grad_norm": 2.471617035887821, "learning_rate": 3.893723159510875e-05, "loss": 1.2432, "step": 1569 }, { "epoch": 0.122232494769111, "grad_norm": 2.583799810753158, "learning_rate": 3.893562543435739e-05, "loss": 1.2361, "step": 1570 }, { "epoch": 0.12231034986132061, "grad_norm": 2.6134638996256334, "learning_rate": 3.8934018094005586e-05, "loss": 1.2089, "step": 1571 }, { "epoch": 0.12238820495353024, "grad_norm": 2.594766610717827, "learning_rate": 3.893240957415345e-05, "loss": 1.1364, "step": 1572 }, { "epoch": 0.12246606004573987, "grad_norm": 2.6545853624545415, "learning_rate": 3.8930799874901186e-05, "loss": 1.1875, "step": 1573 }, { "epoch": 0.12254391513794949, "grad_norm": 2.8731278918254834, "learning_rate": 3.8929188996349074e-05, "loss": 1.1909, "step": 1574 }, { "epoch": 0.12262177023015912, "grad_norm": 2.5885252569893566, "learning_rate": 3.8927576938597464e-05, "loss": 1.1665, "step": 1575 }, { "epoch": 0.12269962532236874, "grad_norm": 2.906352237272357, "learning_rate": 3.892596370174678e-05, "loss": 1.2712, "step": 1576 }, { "epoch": 0.12277748041457837, "grad_norm": 2.82466647328827, "learning_rate": 3.892434928589751e-05, "loss": 1.2948, "step": 1577 }, { "epoch": 0.122855335506788, "grad_norm": 2.61403333306259, "learning_rate": 3.892273369115024e-05, "loss": 1.2932, "step": 1578 }, { "epoch": 0.12293319059899761, "grad_norm": 3.0008274397566277, "learning_rate": 3.89211169176056e-05, "loss": 1.342, "step": 1579 }, { "epoch": 0.12301104569120724, "grad_norm": 2.734801322611217, "learning_rate": 3.891949896536431e-05, "loss": 1.3165, "step": 1580 }, { "epoch": 0.12308890078341686, "grad_norm": 2.806898149405837, "learning_rate": 3.891787983452717e-05, "loss": 1.2879, "step": 1581 }, { "epoch": 0.12316675587562649, "grad_norm": 2.651573518631926, "learning_rate": 3.8916259525195034e-05, "loss": 1.1253, "step": 1582 }, { "epoch": 0.12324461096783612, "grad_norm": 2.7573389350720103, "learning_rate": 3.891463803746885e-05, "loss": 1.2605, "step": 1583 }, { "epoch": 0.12332246606004574, "grad_norm": 2.848794569486142, "learning_rate": 3.8913015371449615e-05, "loss": 1.2181, "step": 1584 }, { "epoch": 0.12340032115225537, "grad_norm": 2.805747304344384, "learning_rate": 3.8911391527238426e-05, "loss": 1.2728, "step": 1585 }, { "epoch": 0.12347817624446498, "grad_norm": 2.921144190752731, "learning_rate": 3.8909766504936437e-05, "loss": 1.208, "step": 1586 }, { "epoch": 0.12355603133667462, "grad_norm": 3.0068741145433036, "learning_rate": 3.8908140304644877e-05, "loss": 1.2768, "step": 1587 }, { "epoch": 0.12363388642888425, "grad_norm": 2.5226594801745894, "learning_rate": 3.890651292646505e-05, "loss": 1.2121, "step": 1588 }, { "epoch": 0.12371174152109386, "grad_norm": 2.8327528111630826, "learning_rate": 3.890488437049834e-05, "loss": 1.2282, "step": 1589 }, { "epoch": 0.1237895966133035, "grad_norm": 2.645437237035061, "learning_rate": 3.890325463684619e-05, "loss": 1.2032, "step": 1590 }, { "epoch": 0.12386745170551311, "grad_norm": 2.61843541303347, "learning_rate": 3.8901623725610134e-05, "loss": 1.1127, "step": 1591 }, { "epoch": 0.12394530679772274, "grad_norm": 3.157744716135458, "learning_rate": 3.889999163689177e-05, "loss": 1.2321, "step": 1592 }, { "epoch": 0.12402316188993236, "grad_norm": 2.656441506223948, "learning_rate": 3.889835837079275e-05, "loss": 1.1538, "step": 1593 }, { "epoch": 0.12410101698214199, "grad_norm": 2.8095877634379693, "learning_rate": 3.889672392741485e-05, "loss": 1.1519, "step": 1594 }, { "epoch": 0.12417887207435162, "grad_norm": 2.615137791174813, "learning_rate": 3.8895088306859864e-05, "loss": 1.2291, "step": 1595 }, { "epoch": 0.12425672716656123, "grad_norm": 2.8669251176521935, "learning_rate": 3.8893451509229685e-05, "loss": 1.1996, "step": 1596 }, { "epoch": 0.12433458225877086, "grad_norm": 2.6954313206182534, "learning_rate": 3.889181353462629e-05, "loss": 1.1375, "step": 1597 }, { "epoch": 0.12441243735098048, "grad_norm": 2.855118477214201, "learning_rate": 3.889017438315171e-05, "loss": 1.1986, "step": 1598 }, { "epoch": 0.12449029244319011, "grad_norm": 2.686712104920814, "learning_rate": 3.888853405490806e-05, "loss": 1.216, "step": 1599 }, { "epoch": 0.12456814753539974, "grad_norm": 2.64255745395529, "learning_rate": 3.888689254999753e-05, "loss": 1.1885, "step": 1600 }, { "epoch": 0.12456814753539974, "eval_loss": 0.14783307909965515, "eval_runtime": 162.5011, "eval_samples_per_second": 17.723, "eval_steps_per_second": 0.634, "step": 1600 }, { "epoch": 0.12464600262760936, "grad_norm": 2.7093413573963088, "learning_rate": 3.8885249868522355e-05, "loss": 1.2125, "step": 1601 }, { "epoch": 0.12472385771981899, "grad_norm": 2.790874158252824, "learning_rate": 3.8883606010584894e-05, "loss": 1.1669, "step": 1602 }, { "epoch": 0.1248017128120286, "grad_norm": 2.682872920905581, "learning_rate": 3.888196097628753e-05, "loss": 1.246, "step": 1603 }, { "epoch": 0.12487956790423824, "grad_norm": 2.5209872540131704, "learning_rate": 3.888031476573275e-05, "loss": 1.1344, "step": 1604 }, { "epoch": 0.12495742299644787, "grad_norm": 2.58536027886587, "learning_rate": 3.887866737902311e-05, "loss": 1.1574, "step": 1605 }, { "epoch": 0.1250352780886575, "grad_norm": 2.5562185241638136, "learning_rate": 3.887701881626123e-05, "loss": 1.1841, "step": 1606 }, { "epoch": 0.12511313318086711, "grad_norm": 2.6869145366139686, "learning_rate": 3.887536907754981e-05, "loss": 1.2675, "step": 1607 }, { "epoch": 0.12519098827307673, "grad_norm": 2.6509098407800935, "learning_rate": 3.887371816299161e-05, "loss": 1.2248, "step": 1608 }, { "epoch": 0.12526884336528635, "grad_norm": 2.5963462245360778, "learning_rate": 3.887206607268949e-05, "loss": 1.0911, "step": 1609 }, { "epoch": 0.125346698457496, "grad_norm": 2.727741053028032, "learning_rate": 3.887041280674636e-05, "loss": 1.294, "step": 1610 }, { "epoch": 0.1254245535497056, "grad_norm": 2.7567711388102123, "learning_rate": 3.886875836526521e-05, "loss": 1.2522, "step": 1611 }, { "epoch": 0.12550240864191523, "grad_norm": 2.6134466738070197, "learning_rate": 3.88671027483491e-05, "loss": 1.1802, "step": 1612 }, { "epoch": 0.12558026373412487, "grad_norm": 2.501232352915557, "learning_rate": 3.886544595610118e-05, "loss": 1.195, "step": 1613 }, { "epoch": 0.1256581188263345, "grad_norm": 2.766663636579171, "learning_rate": 3.8863787988624645e-05, "loss": 1.1599, "step": 1614 }, { "epoch": 0.1257359739185441, "grad_norm": 2.5198183323711114, "learning_rate": 3.8862128846022794e-05, "loss": 1.1537, "step": 1615 }, { "epoch": 0.12581382901075375, "grad_norm": 2.7971808458123557, "learning_rate": 3.8860468528398964e-05, "loss": 1.1646, "step": 1616 }, { "epoch": 0.12589168410296336, "grad_norm": 2.656265462082965, "learning_rate": 3.8858807035856606e-05, "loss": 1.2851, "step": 1617 }, { "epoch": 0.12596953919517298, "grad_norm": 3.1484091260161633, "learning_rate": 3.885714436849921e-05, "loss": 1.1643, "step": 1618 }, { "epoch": 0.1260473942873826, "grad_norm": 2.637517926840318, "learning_rate": 3.885548052643036e-05, "loss": 1.1815, "step": 1619 }, { "epoch": 0.12612524937959224, "grad_norm": 2.803402729954016, "learning_rate": 3.88538155097537e-05, "loss": 1.1534, "step": 1620 }, { "epoch": 0.12620310447180186, "grad_norm": 2.776488458734407, "learning_rate": 3.885214931857295e-05, "loss": 1.1989, "step": 1621 }, { "epoch": 0.12628095956401147, "grad_norm": 2.6400812707841936, "learning_rate": 3.8850481952991915e-05, "loss": 1.1415, "step": 1622 }, { "epoch": 0.12635881465622112, "grad_norm": 2.772710521858143, "learning_rate": 3.8848813413114464e-05, "loss": 1.2149, "step": 1623 }, { "epoch": 0.12643666974843074, "grad_norm": 2.5560909179771327, "learning_rate": 3.884714369904453e-05, "loss": 1.1595, "step": 1624 }, { "epoch": 0.12651452484064035, "grad_norm": 2.6048518231645765, "learning_rate": 3.884547281088613e-05, "loss": 1.2304, "step": 1625 }, { "epoch": 0.12659237993285, "grad_norm": 2.7540607467938716, "learning_rate": 3.8843800748743355e-05, "loss": 1.1844, "step": 1626 }, { "epoch": 0.1266702350250596, "grad_norm": 2.520849023658265, "learning_rate": 3.884212751272038e-05, "loss": 1.089, "step": 1627 }, { "epoch": 0.12674809011726923, "grad_norm": 2.645303743127532, "learning_rate": 3.8840453102921415e-05, "loss": 1.2554, "step": 1628 }, { "epoch": 0.12682594520947885, "grad_norm": 2.906785132434468, "learning_rate": 3.8838777519450783e-05, "loss": 1.2142, "step": 1629 }, { "epoch": 0.1269038003016885, "grad_norm": 2.6134505144612064, "learning_rate": 3.883710076241285e-05, "loss": 1.1979, "step": 1630 }, { "epoch": 0.1269816553938981, "grad_norm": 2.60412321928293, "learning_rate": 3.88354228319121e-05, "loss": 1.1509, "step": 1631 }, { "epoch": 0.12705951048610772, "grad_norm": 2.7639797115572584, "learning_rate": 3.883374372805303e-05, "loss": 1.2093, "step": 1632 }, { "epoch": 0.12713736557831737, "grad_norm": 2.9067914653929146, "learning_rate": 3.883206345094026e-05, "loss": 1.2037, "step": 1633 }, { "epoch": 0.12721522067052699, "grad_norm": 2.8172376447215806, "learning_rate": 3.8830382000678444e-05, "loss": 1.2427, "step": 1634 }, { "epoch": 0.1272930757627366, "grad_norm": 2.882500635925532, "learning_rate": 3.8828699377372355e-05, "loss": 1.1888, "step": 1635 }, { "epoch": 0.12737093085494622, "grad_norm": 2.6301282086854996, "learning_rate": 3.882701558112679e-05, "loss": 1.2143, "step": 1636 }, { "epoch": 0.12744878594715586, "grad_norm": 2.65515927532115, "learning_rate": 3.8825330612046646e-05, "loss": 1.2015, "step": 1637 }, { "epoch": 0.12752664103936548, "grad_norm": 2.4320842564302025, "learning_rate": 3.882364447023689e-05, "loss": 1.1302, "step": 1638 }, { "epoch": 0.1276044961315751, "grad_norm": 2.8353020025995015, "learning_rate": 3.882195715580257e-05, "loss": 1.2376, "step": 1639 }, { "epoch": 0.12768235122378474, "grad_norm": 2.66704735761859, "learning_rate": 3.882026866884878e-05, "loss": 1.1589, "step": 1640 }, { "epoch": 0.12776020631599436, "grad_norm": 2.7890514499128987, "learning_rate": 3.881857900948073e-05, "loss": 1.1419, "step": 1641 }, { "epoch": 0.12783806140820397, "grad_norm": 2.693833904451276, "learning_rate": 3.881688817780365e-05, "loss": 1.1572, "step": 1642 }, { "epoch": 0.12791591650041362, "grad_norm": 2.699094535077666, "learning_rate": 3.88151961739229e-05, "loss": 1.1181, "step": 1643 }, { "epoch": 0.12799377159262323, "grad_norm": 2.715712359025551, "learning_rate": 3.881350299794384e-05, "loss": 1.1311, "step": 1644 }, { "epoch": 0.12807162668483285, "grad_norm": 2.7253157345052696, "learning_rate": 3.8811808649972e-05, "loss": 1.137, "step": 1645 }, { "epoch": 0.12814948177704247, "grad_norm": 2.5934540090405096, "learning_rate": 3.881011313011289e-05, "loss": 1.1327, "step": 1646 }, { "epoch": 0.1282273368692521, "grad_norm": 2.640167781085633, "learning_rate": 3.880841643847216e-05, "loss": 1.0801, "step": 1647 }, { "epoch": 0.12830519196146173, "grad_norm": 2.686228096214694, "learning_rate": 3.880671857515548e-05, "loss": 1.2099, "step": 1648 }, { "epoch": 0.12838304705367135, "grad_norm": 2.76941826217227, "learning_rate": 3.880501954026864e-05, "loss": 1.2084, "step": 1649 }, { "epoch": 0.128460902145881, "grad_norm": 2.486500315567406, "learning_rate": 3.880331933391747e-05, "loss": 1.1153, "step": 1650 }, { "epoch": 0.128460902145881, "eval_loss": 0.1443619430065155, "eval_runtime": 162.7353, "eval_samples_per_second": 17.697, "eval_steps_per_second": 0.633, "step": 1650 }, { "epoch": 0.1285387572380906, "grad_norm": 2.502423472693318, "learning_rate": 3.8801617956207896e-05, "loss": 1.1038, "step": 1651 }, { "epoch": 0.12861661233030022, "grad_norm": 2.495199009459772, "learning_rate": 3.8799915407245894e-05, "loss": 1.178, "step": 1652 }, { "epoch": 0.12869446742250987, "grad_norm": 2.703519227063851, "learning_rate": 3.879821168713753e-05, "loss": 1.1286, "step": 1653 }, { "epoch": 0.12877232251471948, "grad_norm": 2.5210805836531796, "learning_rate": 3.8796506795988936e-05, "loss": 1.1293, "step": 1654 }, { "epoch": 0.1288501776069291, "grad_norm": 2.680079810552669, "learning_rate": 3.879480073390632e-05, "loss": 1.1254, "step": 1655 }, { "epoch": 0.12892803269913872, "grad_norm": 2.795478700092322, "learning_rate": 3.879309350099596e-05, "loss": 1.2359, "step": 1656 }, { "epoch": 0.12900588779134836, "grad_norm": 2.8576997476904964, "learning_rate": 3.8791385097364216e-05, "loss": 1.18, "step": 1657 }, { "epoch": 0.12908374288355798, "grad_norm": 2.6630015752828275, "learning_rate": 3.87896755231175e-05, "loss": 1.2192, "step": 1658 }, { "epoch": 0.1291615979757676, "grad_norm": 2.8372746043266224, "learning_rate": 3.878796477836232e-05, "loss": 1.2351, "step": 1659 }, { "epoch": 0.12923945306797724, "grad_norm": 2.65945176123944, "learning_rate": 3.878625286320525e-05, "loss": 1.1047, "step": 1660 }, { "epoch": 0.12931730816018686, "grad_norm": 2.9324225966595865, "learning_rate": 3.878453977775293e-05, "loss": 1.1854, "step": 1661 }, { "epoch": 0.12939516325239647, "grad_norm": 2.5954150038006736, "learning_rate": 3.878282552211207e-05, "loss": 1.1605, "step": 1662 }, { "epoch": 0.1294730183446061, "grad_norm": 2.6131346611754513, "learning_rate": 3.878111009638947e-05, "loss": 1.1369, "step": 1663 }, { "epoch": 0.12955087343681573, "grad_norm": 2.677626519384387, "learning_rate": 3.8779393500691994e-05, "loss": 1.1447, "step": 1664 }, { "epoch": 0.12962872852902535, "grad_norm": 2.74758479589868, "learning_rate": 3.877767573512658e-05, "loss": 1.2042, "step": 1665 }, { "epoch": 0.12970658362123497, "grad_norm": 2.7500584987772236, "learning_rate": 3.877595679980022e-05, "loss": 1.1305, "step": 1666 }, { "epoch": 0.1297844387134446, "grad_norm": 2.688266175406869, "learning_rate": 3.8774236694820006e-05, "loss": 1.1607, "step": 1667 }, { "epoch": 0.12986229380565423, "grad_norm": 2.5170573807814325, "learning_rate": 3.8772515420293096e-05, "loss": 1.1418, "step": 1668 }, { "epoch": 0.12994014889786384, "grad_norm": 2.6966574573524897, "learning_rate": 3.877079297632671e-05, "loss": 1.1286, "step": 1669 }, { "epoch": 0.1300180039900735, "grad_norm": 2.5884481197613516, "learning_rate": 3.876906936302815e-05, "loss": 1.1524, "step": 1670 }, { "epoch": 0.1300958590822831, "grad_norm": 2.966764539692452, "learning_rate": 3.8767344580504806e-05, "loss": 1.1492, "step": 1671 }, { "epoch": 0.13017371417449272, "grad_norm": 2.465729961566618, "learning_rate": 3.87656186288641e-05, "loss": 1.1339, "step": 1672 }, { "epoch": 0.13025156926670234, "grad_norm": 3.1081475463766224, "learning_rate": 3.876389150821355e-05, "loss": 1.2033, "step": 1673 }, { "epoch": 0.13032942435891198, "grad_norm": 2.6644159838045156, "learning_rate": 3.876216321866077e-05, "loss": 1.0961, "step": 1674 }, { "epoch": 0.1304072794511216, "grad_norm": 2.9938852844143318, "learning_rate": 3.876043376031341e-05, "loss": 1.1516, "step": 1675 }, { "epoch": 0.13048513454333122, "grad_norm": 2.8955358055132905, "learning_rate": 3.8758703133279205e-05, "loss": 1.2311, "step": 1676 }, { "epoch": 0.13056298963554086, "grad_norm": 2.8910491783998427, "learning_rate": 3.875697133766597e-05, "loss": 1.165, "step": 1677 }, { "epoch": 0.13064084472775048, "grad_norm": 2.6617541552050836, "learning_rate": 3.8755238373581586e-05, "loss": 1.1161, "step": 1678 }, { "epoch": 0.1307186998199601, "grad_norm": 2.8296697082589564, "learning_rate": 3.875350424113401e-05, "loss": 1.1302, "step": 1679 }, { "epoch": 0.13079655491216974, "grad_norm": 2.6922331949975713, "learning_rate": 3.875176894043126e-05, "loss": 1.1515, "step": 1680 }, { "epoch": 0.13087441000437935, "grad_norm": 2.5390249293024345, "learning_rate": 3.8750032471581454e-05, "loss": 1.091, "step": 1681 }, { "epoch": 0.13095226509658897, "grad_norm": 2.6645369069521427, "learning_rate": 3.8748294834692755e-05, "loss": 1.1374, "step": 1682 }, { "epoch": 0.1310301201887986, "grad_norm": 2.582525149332848, "learning_rate": 3.874655602987342e-05, "loss": 1.1373, "step": 1683 }, { "epoch": 0.13110797528100823, "grad_norm": 2.721690876886301, "learning_rate": 3.874481605723175e-05, "loss": 1.1659, "step": 1684 }, { "epoch": 0.13118583037321785, "grad_norm": 3.1951265948939978, "learning_rate": 3.874307491687616e-05, "loss": 1.1843, "step": 1685 }, { "epoch": 0.13126368546542747, "grad_norm": 2.709154241841868, "learning_rate": 3.8741332608915095e-05, "loss": 1.1852, "step": 1686 }, { "epoch": 0.1313415405576371, "grad_norm": 2.4840110007718383, "learning_rate": 3.87395891334571e-05, "loss": 1.1881, "step": 1687 }, { "epoch": 0.13141939564984673, "grad_norm": 2.550410553599177, "learning_rate": 3.873784449061079e-05, "loss": 1.1012, "step": 1688 }, { "epoch": 0.13149725074205634, "grad_norm": 2.6400284579122593, "learning_rate": 3.873609868048484e-05, "loss": 1.0717, "step": 1689 }, { "epoch": 0.13157510583426596, "grad_norm": 2.723053361171186, "learning_rate": 3.8734351703188015e-05, "loss": 1.142, "step": 1690 }, { "epoch": 0.1316529609264756, "grad_norm": 2.669267119597014, "learning_rate": 3.873260355882913e-05, "loss": 1.1014, "step": 1691 }, { "epoch": 0.13173081601868522, "grad_norm": 2.6167235116524776, "learning_rate": 3.87308542475171e-05, "loss": 1.1772, "step": 1692 }, { "epoch": 0.13180867111089484, "grad_norm": 2.7074797793576963, "learning_rate": 3.8729103769360897e-05, "loss": 1.1937, "step": 1693 }, { "epoch": 0.13188652620310448, "grad_norm": 2.630223664250492, "learning_rate": 3.872735212446956e-05, "loss": 1.11, "step": 1694 }, { "epoch": 0.1319643812953141, "grad_norm": 2.7958072456836054, "learning_rate": 3.8725599312952204e-05, "loss": 1.2072, "step": 1695 }, { "epoch": 0.13204223638752371, "grad_norm": 2.413594688919641, "learning_rate": 3.872384533491804e-05, "loss": 1.1104, "step": 1696 }, { "epoch": 0.13212009147973336, "grad_norm": 2.5930138739998645, "learning_rate": 3.872209019047632e-05, "loss": 1.1072, "step": 1697 }, { "epoch": 0.13219794657194298, "grad_norm": 2.6001915917553378, "learning_rate": 3.8720333879736374e-05, "loss": 1.1358, "step": 1698 }, { "epoch": 0.1322758016641526, "grad_norm": 2.5191757328909303, "learning_rate": 3.8718576402807636e-05, "loss": 1.1086, "step": 1699 }, { "epoch": 0.1323536567563622, "grad_norm": 2.6949592422024526, "learning_rate": 3.871681775979956e-05, "loss": 1.1197, "step": 1700 }, { "epoch": 0.1323536567563622, "eval_loss": 0.14142480492591858, "eval_runtime": 163.23, "eval_samples_per_second": 17.644, "eval_steps_per_second": 0.631, "step": 1700 }, { "epoch": 0.13243151184857185, "grad_norm": 2.5966576283735825, "learning_rate": 3.871505795082172e-05, "loss": 1.0565, "step": 1701 }, { "epoch": 0.13250936694078147, "grad_norm": 2.7400836584856014, "learning_rate": 3.8713296975983735e-05, "loss": 1.1647, "step": 1702 }, { "epoch": 0.1325872220329911, "grad_norm": 2.595116275088492, "learning_rate": 3.871153483539531e-05, "loss": 1.1544, "step": 1703 }, { "epoch": 0.13266507712520073, "grad_norm": 2.2834877250501715, "learning_rate": 3.8709771529166216e-05, "loss": 1.0271, "step": 1704 }, { "epoch": 0.13274293221741035, "grad_norm": 2.5587553778815786, "learning_rate": 3.87080070574063e-05, "loss": 1.087, "step": 1705 }, { "epoch": 0.13282078730961996, "grad_norm": 2.6390066218094432, "learning_rate": 3.8706241420225484e-05, "loss": 1.0758, "step": 1706 }, { "epoch": 0.13289864240182958, "grad_norm": 2.358212215617519, "learning_rate": 3.8704474617733746e-05, "loss": 1.0135, "step": 1707 }, { "epoch": 0.13297649749403923, "grad_norm": 2.6601054642895816, "learning_rate": 3.870270665004116e-05, "loss": 1.1741, "step": 1708 }, { "epoch": 0.13305435258624884, "grad_norm": 2.4591042740882507, "learning_rate": 3.870093751725787e-05, "loss": 1.1152, "step": 1709 }, { "epoch": 0.13313220767845846, "grad_norm": 2.503208209103838, "learning_rate": 3.869916721949406e-05, "loss": 1.1484, "step": 1710 }, { "epoch": 0.1332100627706681, "grad_norm": 2.645111673489355, "learning_rate": 3.8697395756860035e-05, "loss": 1.1, "step": 1711 }, { "epoch": 0.13328791786287772, "grad_norm": 2.639774020419496, "learning_rate": 3.869562312946614e-05, "loss": 1.1251, "step": 1712 }, { "epoch": 0.13336577295508734, "grad_norm": 2.5946438518017767, "learning_rate": 3.86938493374228e-05, "loss": 1.1452, "step": 1713 }, { "epoch": 0.13344362804729698, "grad_norm": 2.850498232920889, "learning_rate": 3.869207438084051e-05, "loss": 1.1198, "step": 1714 }, { "epoch": 0.1335214831395066, "grad_norm": 2.68344936608395, "learning_rate": 3.8690298259829856e-05, "loss": 1.1586, "step": 1715 }, { "epoch": 0.1335993382317162, "grad_norm": 2.611238109522018, "learning_rate": 3.868852097450147e-05, "loss": 1.0988, "step": 1716 }, { "epoch": 0.13367719332392583, "grad_norm": 2.591680218668776, "learning_rate": 3.868674252496607e-05, "loss": 1.1057, "step": 1717 }, { "epoch": 0.13375504841613547, "grad_norm": 2.5659528353947216, "learning_rate": 3.868496291133444e-05, "loss": 1.0575, "step": 1718 }, { "epoch": 0.1338329035083451, "grad_norm": 2.7875955913603487, "learning_rate": 3.868318213371746e-05, "loss": 1.1522, "step": 1719 }, { "epoch": 0.1339107586005547, "grad_norm": 2.606607360250423, "learning_rate": 3.868140019222604e-05, "loss": 1.1249, "step": 1720 }, { "epoch": 0.13398861369276435, "grad_norm": 2.4367707393729643, "learning_rate": 3.86796170869712e-05, "loss": 1.0898, "step": 1721 }, { "epoch": 0.13406646878497397, "grad_norm": 2.8110886056046533, "learning_rate": 3.8677832818064026e-05, "loss": 1.1772, "step": 1722 }, { "epoch": 0.13414432387718359, "grad_norm": 2.5789961109227177, "learning_rate": 3.8676047385615655e-05, "loss": 1.149, "step": 1723 }, { "epoch": 0.13422217896939323, "grad_norm": 2.5169570213980506, "learning_rate": 3.867426078973732e-05, "loss": 1.1947, "step": 1724 }, { "epoch": 0.13430003406160285, "grad_norm": 2.401785760973953, "learning_rate": 3.867247303054031e-05, "loss": 1.062, "step": 1725 }, { "epoch": 0.13437788915381246, "grad_norm": 2.4588336387673624, "learning_rate": 3.8670684108136e-05, "loss": 1.0829, "step": 1726 }, { "epoch": 0.13445574424602208, "grad_norm": 2.828178676012127, "learning_rate": 3.866889402263583e-05, "loss": 1.1425, "step": 1727 }, { "epoch": 0.13453359933823172, "grad_norm": 2.4318945486025294, "learning_rate": 3.866710277415131e-05, "loss": 1.0654, "step": 1728 }, { "epoch": 0.13461145443044134, "grad_norm": 2.5907947865289542, "learning_rate": 3.866531036279404e-05, "loss": 1.1483, "step": 1729 }, { "epoch": 0.13468930952265096, "grad_norm": 2.6468407018451687, "learning_rate": 3.866351678867567e-05, "loss": 1.0935, "step": 1730 }, { "epoch": 0.1347671646148606, "grad_norm": 2.6528463497890393, "learning_rate": 3.8661722051907924e-05, "loss": 1.2412, "step": 1731 }, { "epoch": 0.13484501970707022, "grad_norm": 2.8170426259797794, "learning_rate": 3.865992615260261e-05, "loss": 1.1013, "step": 1732 }, { "epoch": 0.13492287479927983, "grad_norm": 2.571072577059128, "learning_rate": 3.8658129090871607e-05, "loss": 1.0595, "step": 1733 }, { "epoch": 0.13500072989148945, "grad_norm": 2.729152450147115, "learning_rate": 3.865633086682687e-05, "loss": 1.1364, "step": 1734 }, { "epoch": 0.1350785849836991, "grad_norm": 2.911087584289117, "learning_rate": 3.8654531480580404e-05, "loss": 1.1684, "step": 1735 }, { "epoch": 0.1351564400759087, "grad_norm": 2.7804306939358843, "learning_rate": 3.865273093224431e-05, "loss": 1.147, "step": 1736 }, { "epoch": 0.13523429516811833, "grad_norm": 2.987065846051008, "learning_rate": 3.8650929221930756e-05, "loss": 1.1716, "step": 1737 }, { "epoch": 0.13531215026032797, "grad_norm": 2.4728335515222457, "learning_rate": 3.8649126349751986e-05, "loss": 1.0896, "step": 1738 }, { "epoch": 0.1353900053525376, "grad_norm": 2.6947609832327344, "learning_rate": 3.864732231582029e-05, "loss": 1.1637, "step": 1739 }, { "epoch": 0.1354678604447472, "grad_norm": 2.9139213185605413, "learning_rate": 3.864551712024808e-05, "loss": 1.0483, "step": 1740 }, { "epoch": 0.13554571553695685, "grad_norm": 2.7950839464942008, "learning_rate": 3.864371076314778e-05, "loss": 1.1588, "step": 1741 }, { "epoch": 0.13562357062916647, "grad_norm": 2.689486924982054, "learning_rate": 3.864190324463194e-05, "loss": 1.1583, "step": 1742 }, { "epoch": 0.13570142572137608, "grad_norm": 2.5479601066913675, "learning_rate": 3.8640094564813156e-05, "loss": 1.0616, "step": 1743 }, { "epoch": 0.1357792808135857, "grad_norm": 2.5905944065600517, "learning_rate": 3.8638284723804096e-05, "loss": 1.0398, "step": 1744 }, { "epoch": 0.13585713590579535, "grad_norm": 2.757622465561266, "learning_rate": 3.86364737217175e-05, "loss": 1.1087, "step": 1745 }, { "epoch": 0.13593499099800496, "grad_norm": 2.7234667795103045, "learning_rate": 3.8634661558666196e-05, "loss": 1.1202, "step": 1746 }, { "epoch": 0.13601284609021458, "grad_norm": 2.459259383914484, "learning_rate": 3.863284823476306e-05, "loss": 1.0944, "step": 1747 }, { "epoch": 0.13609070118242422, "grad_norm": 2.7200094228830403, "learning_rate": 3.863103375012106e-05, "loss": 1.1182, "step": 1748 }, { "epoch": 0.13616855627463384, "grad_norm": 2.6264132269124327, "learning_rate": 3.862921810485324e-05, "loss": 1.1461, "step": 1749 }, { "epoch": 0.13624641136684346, "grad_norm": 2.553058618242826, "learning_rate": 3.8627401299072695e-05, "loss": 1.119, "step": 1750 }, { "epoch": 0.13624641136684346, "eval_loss": 0.13808640837669373, "eval_runtime": 163.0382, "eval_samples_per_second": 17.665, "eval_steps_per_second": 0.632, "step": 1750 }, { "epoch": 0.1363242664590531, "grad_norm": 2.50418059464272, "learning_rate": 3.862558333289261e-05, "loss": 1.1165, "step": 1751 }, { "epoch": 0.13640212155126272, "grad_norm": 2.530186828963826, "learning_rate": 3.8623764206426214e-05, "loss": 1.0682, "step": 1752 }, { "epoch": 0.13647997664347233, "grad_norm": 2.5391151509183914, "learning_rate": 3.862194391978686e-05, "loss": 1.0675, "step": 1753 }, { "epoch": 0.13655783173568195, "grad_norm": 2.718950411197901, "learning_rate": 3.862012247308793e-05, "loss": 1.078, "step": 1754 }, { "epoch": 0.1366356868278916, "grad_norm": 2.5304779889987503, "learning_rate": 3.86182998664429e-05, "loss": 1.0529, "step": 1755 }, { "epoch": 0.1367135419201012, "grad_norm": 2.9061489727305476, "learning_rate": 3.861647609996529e-05, "loss": 1.1279, "step": 1756 }, { "epoch": 0.13679139701231083, "grad_norm": 2.592665236006953, "learning_rate": 3.861465117376872e-05, "loss": 1.1013, "step": 1757 }, { "epoch": 0.13686925210452047, "grad_norm": 2.6206130726324974, "learning_rate": 3.8612825087966885e-05, "loss": 1.0542, "step": 1758 }, { "epoch": 0.1369471071967301, "grad_norm": 2.5759029433121556, "learning_rate": 3.861099784267354e-05, "loss": 1.1142, "step": 1759 }, { "epoch": 0.1370249622889397, "grad_norm": 2.412740677771473, "learning_rate": 3.8609169438002504e-05, "loss": 1.042, "step": 1760 }, { "epoch": 0.13710281738114932, "grad_norm": 2.7589029820979327, "learning_rate": 3.860733987406768e-05, "loss": 1.1692, "step": 1761 }, { "epoch": 0.13718067247335897, "grad_norm": 2.5813589926010927, "learning_rate": 3.8605509150983046e-05, "loss": 1.0675, "step": 1762 }, { "epoch": 0.13725852756556858, "grad_norm": 2.692174685317511, "learning_rate": 3.860367726886265e-05, "loss": 1.1116, "step": 1763 }, { "epoch": 0.1373363826577782, "grad_norm": 2.6549464974201222, "learning_rate": 3.8601844227820605e-05, "loss": 1.083, "step": 1764 }, { "epoch": 0.13741423774998784, "grad_norm": 3.025745873219231, "learning_rate": 3.86000100279711e-05, "loss": 1.1658, "step": 1765 }, { "epoch": 0.13749209284219746, "grad_norm": 2.6923604800429985, "learning_rate": 3.8598174669428395e-05, "loss": 1.1581, "step": 1766 }, { "epoch": 0.13756994793440708, "grad_norm": 2.955462671849095, "learning_rate": 3.859633815230683e-05, "loss": 1.1099, "step": 1767 }, { "epoch": 0.13764780302661672, "grad_norm": 2.6081492966547746, "learning_rate": 3.8594500476720806e-05, "loss": 1.0714, "step": 1768 }, { "epoch": 0.13772565811882634, "grad_norm": 2.8258395200891617, "learning_rate": 3.859266164278481e-05, "loss": 1.0573, "step": 1769 }, { "epoch": 0.13780351321103596, "grad_norm": 2.655233590933311, "learning_rate": 3.859082165061338e-05, "loss": 1.0287, "step": 1770 }, { "epoch": 0.13788136830324557, "grad_norm": 2.835446758585377, "learning_rate": 3.858898050032115e-05, "loss": 1.0991, "step": 1771 }, { "epoch": 0.13795922339545522, "grad_norm": 2.912712125275151, "learning_rate": 3.85871381920228e-05, "loss": 1.1101, "step": 1772 }, { "epoch": 0.13803707848766483, "grad_norm": 2.5429879693023016, "learning_rate": 3.8585294725833125e-05, "loss": 1.0889, "step": 1773 }, { "epoch": 0.13811493357987445, "grad_norm": 2.566694636964866, "learning_rate": 3.858345010186694e-05, "loss": 1.0679, "step": 1774 }, { "epoch": 0.1381927886720841, "grad_norm": 2.4594421754956266, "learning_rate": 3.858160432023916e-05, "loss": 1.0816, "step": 1775 }, { "epoch": 0.1382706437642937, "grad_norm": 2.722833318363116, "learning_rate": 3.857975738106477e-05, "loss": 1.0619, "step": 1776 }, { "epoch": 0.13834849885650333, "grad_norm": 2.5884129897574066, "learning_rate": 3.857790928445884e-05, "loss": 1.1503, "step": 1777 }, { "epoch": 0.13842635394871297, "grad_norm": 2.6853201013167634, "learning_rate": 3.857606003053647e-05, "loss": 1.1291, "step": 1778 }, { "epoch": 0.1385042090409226, "grad_norm": 2.5327536110187903, "learning_rate": 3.8574209619412886e-05, "loss": 1.0589, "step": 1779 }, { "epoch": 0.1385820641331322, "grad_norm": 2.650150878092718, "learning_rate": 3.857235805120335e-05, "loss": 1.113, "step": 1780 }, { "epoch": 0.13865991922534182, "grad_norm": 2.345301657681663, "learning_rate": 3.85705053260232e-05, "loss": 1.019, "step": 1781 }, { "epoch": 0.13873777431755147, "grad_norm": 2.706776470518211, "learning_rate": 3.856865144398786e-05, "loss": 1.0808, "step": 1782 }, { "epoch": 0.13881562940976108, "grad_norm": 2.9276019026685423, "learning_rate": 3.856679640521281e-05, "loss": 1.1499, "step": 1783 }, { "epoch": 0.1388934845019707, "grad_norm": 2.6244754725812167, "learning_rate": 3.856494020981362e-05, "loss": 1.0815, "step": 1784 }, { "epoch": 0.13897133959418034, "grad_norm": 2.6746891018216714, "learning_rate": 3.856308285790592e-05, "loss": 1.0847, "step": 1785 }, { "epoch": 0.13904919468638996, "grad_norm": 2.681872020934163, "learning_rate": 3.856122434960541e-05, "loss": 1.0611, "step": 1786 }, { "epoch": 0.13912704977859958, "grad_norm": 2.640497447037235, "learning_rate": 3.8559364685027864e-05, "loss": 0.9784, "step": 1787 }, { "epoch": 0.1392049048708092, "grad_norm": 2.638914459119179, "learning_rate": 3.855750386428914e-05, "loss": 0.9921, "step": 1788 }, { "epoch": 0.13928275996301884, "grad_norm": 2.69245864424771, "learning_rate": 3.855564188750515e-05, "loss": 1.0828, "step": 1789 }, { "epoch": 0.13936061505522845, "grad_norm": 2.656587746960188, "learning_rate": 3.8553778754791896e-05, "loss": 0.9586, "step": 1790 }, { "epoch": 0.13943847014743807, "grad_norm": 2.8533379776321564, "learning_rate": 3.8551914466265424e-05, "loss": 1.1654, "step": 1791 }, { "epoch": 0.13951632523964771, "grad_norm": 2.8290194012709975, "learning_rate": 3.85500490220419e-05, "loss": 1.1478, "step": 1792 }, { "epoch": 0.13959418033185733, "grad_norm": 2.9035770500372315, "learning_rate": 3.8548182422237503e-05, "loss": 1.0881, "step": 1793 }, { "epoch": 0.13967203542406695, "grad_norm": 2.578186537119586, "learning_rate": 3.854631466696853e-05, "loss": 1.098, "step": 1794 }, { "epoch": 0.1397498905162766, "grad_norm": 2.859194946251911, "learning_rate": 3.854444575635132e-05, "loss": 1.0869, "step": 1795 }, { "epoch": 0.1398277456084862, "grad_norm": 2.6239016642084523, "learning_rate": 3.854257569050232e-05, "loss": 1.0997, "step": 1796 }, { "epoch": 0.13990560070069583, "grad_norm": 2.5753819725634046, "learning_rate": 3.8540704469537996e-05, "loss": 1.096, "step": 1797 }, { "epoch": 0.13998345579290544, "grad_norm": 2.586882882265456, "learning_rate": 3.853883209357495e-05, "loss": 1.1111, "step": 1798 }, { "epoch": 0.1400613108851151, "grad_norm": 2.438066894423169, "learning_rate": 3.853695856272979e-05, "loss": 1.0072, "step": 1799 }, { "epoch": 0.1401391659773247, "grad_norm": 2.650977343422784, "learning_rate": 3.853508387711925e-05, "loss": 1.0538, "step": 1800 }, { "epoch": 0.1401391659773247, "eval_loss": 0.13425160944461823, "eval_runtime": 162.5049, "eval_samples_per_second": 17.723, "eval_steps_per_second": 0.634, "step": 1800 }, { "epoch": 0.14021702106953432, "grad_norm": 2.7506017520338917, "learning_rate": 3.85332080368601e-05, "loss": 1.1116, "step": 1801 }, { "epoch": 0.14029487616174396, "grad_norm": 2.4011092077055918, "learning_rate": 3.853133104206921e-05, "loss": 1.0254, "step": 1802 }, { "epoch": 0.14037273125395358, "grad_norm": 2.5381149053769363, "learning_rate": 3.85294528928635e-05, "loss": 1.062, "step": 1803 }, { "epoch": 0.1404505863461632, "grad_norm": 2.5377262174483946, "learning_rate": 3.8527573589359975e-05, "loss": 1.0996, "step": 1804 }, { "epoch": 0.14052844143837284, "grad_norm": 2.5808198116871033, "learning_rate": 3.8525693131675693e-05, "loss": 1.0566, "step": 1805 }, { "epoch": 0.14060629653058246, "grad_norm": 2.6311842133039574, "learning_rate": 3.852381151992782e-05, "loss": 1.1248, "step": 1806 }, { "epoch": 0.14068415162279208, "grad_norm": 2.5466832884313018, "learning_rate": 3.8521928754233554e-05, "loss": 1.1039, "step": 1807 }, { "epoch": 0.1407620067150017, "grad_norm": 2.4216182140689444, "learning_rate": 3.852004483471018e-05, "loss": 1.0455, "step": 1808 }, { "epoch": 0.14083986180721134, "grad_norm": 2.6569627275646877, "learning_rate": 3.851815976147507e-05, "loss": 1.1032, "step": 1809 }, { "epoch": 0.14091771689942095, "grad_norm": 2.7064957811330537, "learning_rate": 3.851627353464565e-05, "loss": 1.1334, "step": 1810 }, { "epoch": 0.14099557199163057, "grad_norm": 2.416639274095709, "learning_rate": 3.8514386154339425e-05, "loss": 1.0217, "step": 1811 }, { "epoch": 0.1410734270838402, "grad_norm": 2.535232584081124, "learning_rate": 3.851249762067396e-05, "loss": 1.0006, "step": 1812 }, { "epoch": 0.14115128217604983, "grad_norm": 2.324928305514461, "learning_rate": 3.8510607933766915e-05, "loss": 0.9577, "step": 1813 }, { "epoch": 0.14122913726825945, "grad_norm": 3.014058565560181, "learning_rate": 3.8508717093736006e-05, "loss": 1.1117, "step": 1814 }, { "epoch": 0.14130699236046906, "grad_norm": 2.815348388925798, "learning_rate": 3.8506825100699014e-05, "loss": 1.0282, "step": 1815 }, { "epoch": 0.1413848474526787, "grad_norm": 2.6989429230295596, "learning_rate": 3.850493195477381e-05, "loss": 1.0362, "step": 1816 }, { "epoch": 0.14146270254488832, "grad_norm": 2.6587147768623347, "learning_rate": 3.8503037656078336e-05, "loss": 1.0344, "step": 1817 }, { "epoch": 0.14154055763709794, "grad_norm": 3.157451515648853, "learning_rate": 3.850114220473057e-05, "loss": 1.1613, "step": 1818 }, { "epoch": 0.14161841272930759, "grad_norm": 2.462602343804255, "learning_rate": 3.8499245600848624e-05, "loss": 1.0569, "step": 1819 }, { "epoch": 0.1416962678215172, "grad_norm": 2.717684309899047, "learning_rate": 3.849734784455063e-05, "loss": 1.0518, "step": 1820 }, { "epoch": 0.14177412291372682, "grad_norm": 2.9774406532866147, "learning_rate": 3.849544893595481e-05, "loss": 1.1165, "step": 1821 }, { "epoch": 0.14185197800593646, "grad_norm": 2.3829970440526664, "learning_rate": 3.849354887517945e-05, "loss": 1.0028, "step": 1822 }, { "epoch": 0.14192983309814608, "grad_norm": 2.9248114877604414, "learning_rate": 3.849164766234293e-05, "loss": 1.0479, "step": 1823 }, { "epoch": 0.1420076881903557, "grad_norm": 2.670136286050623, "learning_rate": 3.8489745297563665e-05, "loss": 1.0521, "step": 1824 }, { "epoch": 0.1420855432825653, "grad_norm": 2.8231915169313475, "learning_rate": 3.848784178096019e-05, "loss": 1.0562, "step": 1825 }, { "epoch": 0.14216339837477496, "grad_norm": 2.7100203208360183, "learning_rate": 3.848593711265107e-05, "loss": 1.1196, "step": 1826 }, { "epoch": 0.14224125346698457, "grad_norm": 3.155940250751872, "learning_rate": 3.848403129275496e-05, "loss": 1.0891, "step": 1827 }, { "epoch": 0.1423191085591942, "grad_norm": 2.5141576846625284, "learning_rate": 3.848212432139058e-05, "loss": 1.061, "step": 1828 }, { "epoch": 0.14239696365140384, "grad_norm": 2.8673506948865235, "learning_rate": 3.848021619867673e-05, "loss": 0.9985, "step": 1829 }, { "epoch": 0.14247481874361345, "grad_norm": 2.6591849963262173, "learning_rate": 3.847830692473228e-05, "loss": 1.0978, "step": 1830 }, { "epoch": 0.14255267383582307, "grad_norm": 3.3434877866364854, "learning_rate": 3.847639649967616e-05, "loss": 1.0904, "step": 1831 }, { "epoch": 0.1426305289280327, "grad_norm": 2.7473903537392568, "learning_rate": 3.847448492362739e-05, "loss": 1.0956, "step": 1832 }, { "epoch": 0.14270838402024233, "grad_norm": 2.8956079081209083, "learning_rate": 3.8472572196705037e-05, "loss": 1.0447, "step": 1833 }, { "epoch": 0.14278623911245195, "grad_norm": 2.7272025626071947, "learning_rate": 3.847065831902827e-05, "loss": 1.0404, "step": 1834 }, { "epoch": 0.14286409420466156, "grad_norm": 2.994231867663282, "learning_rate": 3.84687432907163e-05, "loss": 1.0526, "step": 1835 }, { "epoch": 0.1429419492968712, "grad_norm": 2.63219992846321, "learning_rate": 3.846682711188845e-05, "loss": 0.94, "step": 1836 }, { "epoch": 0.14301980438908082, "grad_norm": 3.0660510489140167, "learning_rate": 3.8464909782664064e-05, "loss": 1.1248, "step": 1837 }, { "epoch": 0.14309765948129044, "grad_norm": 2.8100002445517727, "learning_rate": 3.8462991303162595e-05, "loss": 1.0548, "step": 1838 }, { "epoch": 0.14317551457350008, "grad_norm": 2.8079175574194495, "learning_rate": 3.8461071673503554e-05, "loss": 1.0456, "step": 1839 }, { "epoch": 0.1432533696657097, "grad_norm": 2.8798477907943743, "learning_rate": 3.8459150893806516e-05, "loss": 1.1097, "step": 1840 }, { "epoch": 0.14333122475791932, "grad_norm": 2.5348591377949603, "learning_rate": 3.845722896419115e-05, "loss": 1.0052, "step": 1841 }, { "epoch": 0.14340907985012893, "grad_norm": 2.4854212737708266, "learning_rate": 3.845530588477717e-05, "loss": 1.0593, "step": 1842 }, { "epoch": 0.14348693494233858, "grad_norm": 2.4314069612670854, "learning_rate": 3.845338165568438e-05, "loss": 0.9947, "step": 1843 }, { "epoch": 0.1435647900345482, "grad_norm": 2.4248966590213765, "learning_rate": 3.845145627703265e-05, "loss": 1.0001, "step": 1844 }, { "epoch": 0.1436426451267578, "grad_norm": 2.387714508693027, "learning_rate": 3.844952974894193e-05, "loss": 1.058, "step": 1845 }, { "epoch": 0.14372050021896746, "grad_norm": 2.517667498227051, "learning_rate": 3.8447602071532224e-05, "loss": 0.9466, "step": 1846 }, { "epoch": 0.14379835531117707, "grad_norm": 2.45350890474074, "learning_rate": 3.844567324492362e-05, "loss": 1.0257, "step": 1847 }, { "epoch": 0.1438762104033867, "grad_norm": 2.715875597175445, "learning_rate": 3.844374326923628e-05, "loss": 1.0877, "step": 1848 }, { "epoch": 0.14395406549559633, "grad_norm": 2.6742581172701883, "learning_rate": 3.8441812144590416e-05, "loss": 1.055, "step": 1849 }, { "epoch": 0.14403192058780595, "grad_norm": 2.642966453961496, "learning_rate": 3.843987987110635e-05, "loss": 1.0358, "step": 1850 }, { "epoch": 0.14403192058780595, "eval_loss": 0.12939216196537018, "eval_runtime": 165.7823, "eval_samples_per_second": 17.372, "eval_steps_per_second": 0.621, "step": 1850 }, { "epoch": 0.14410977568001557, "grad_norm": 2.477044642231272, "learning_rate": 3.843794644890444e-05, "loss": 0.9759, "step": 1851 }, { "epoch": 0.14418763077222518, "grad_norm": 2.5626146070418074, "learning_rate": 3.843601187810513e-05, "loss": 1.0152, "step": 1852 }, { "epoch": 0.14426548586443483, "grad_norm": 2.827144285426966, "learning_rate": 3.843407615882894e-05, "loss": 1.0553, "step": 1853 }, { "epoch": 0.14434334095664444, "grad_norm": 2.6954749047644633, "learning_rate": 3.843213929119646e-05, "loss": 1.0406, "step": 1854 }, { "epoch": 0.14442119604885406, "grad_norm": 2.682498266968707, "learning_rate": 3.843020127532833e-05, "loss": 1.0437, "step": 1855 }, { "epoch": 0.1444990511410637, "grad_norm": 2.504772015815664, "learning_rate": 3.8428262111345295e-05, "loss": 1.0076, "step": 1856 }, { "epoch": 0.14457690623327332, "grad_norm": 2.9595192975850364, "learning_rate": 3.8426321799368146e-05, "loss": 1.1572, "step": 1857 }, { "epoch": 0.14465476132548294, "grad_norm": 2.7768905518852476, "learning_rate": 3.8424380339517764e-05, "loss": 1.0319, "step": 1858 }, { "epoch": 0.14473261641769258, "grad_norm": 2.5332252297020688, "learning_rate": 3.8422437731915086e-05, "loss": 1.026, "step": 1859 }, { "epoch": 0.1448104715099022, "grad_norm": 2.5458515553959677, "learning_rate": 3.8420493976681135e-05, "loss": 0.9996, "step": 1860 }, { "epoch": 0.14488832660211182, "grad_norm": 2.6072940203944173, "learning_rate": 3.841854907393699e-05, "loss": 1.092, "step": 1861 }, { "epoch": 0.14496618169432143, "grad_norm": 2.4224597612600935, "learning_rate": 3.841660302380382e-05, "loss": 0.971, "step": 1862 }, { "epoch": 0.14504403678653108, "grad_norm": 2.540948120065282, "learning_rate": 3.841465582640284e-05, "loss": 1.0286, "step": 1863 }, { "epoch": 0.1451218918787407, "grad_norm": 2.820469774385746, "learning_rate": 3.841270748185536e-05, "loss": 1.0659, "step": 1864 }, { "epoch": 0.1451997469709503, "grad_norm": 2.729212825249767, "learning_rate": 3.841075799028275e-05, "loss": 1.0778, "step": 1865 }, { "epoch": 0.14527760206315996, "grad_norm": 2.473848218684871, "learning_rate": 3.840880735180646e-05, "loss": 0.9904, "step": 1866 }, { "epoch": 0.14535545715536957, "grad_norm": 2.496790422583391, "learning_rate": 3.840685556654799e-05, "loss": 0.9727, "step": 1867 }, { "epoch": 0.1454333122475792, "grad_norm": 2.529723749112431, "learning_rate": 3.840490263462895e-05, "loss": 0.9388, "step": 1868 }, { "epoch": 0.1455111673397888, "grad_norm": 2.505932390218396, "learning_rate": 3.840294855617098e-05, "loss": 1.0016, "step": 1869 }, { "epoch": 0.14558902243199845, "grad_norm": 2.657111425069313, "learning_rate": 3.840099333129582e-05, "loss": 1.0351, "step": 1870 }, { "epoch": 0.14566687752420807, "grad_norm": 2.5700663074428176, "learning_rate": 3.8399036960125264e-05, "loss": 1.0233, "step": 1871 }, { "epoch": 0.14574473261641768, "grad_norm": 2.5849254963144617, "learning_rate": 3.839707944278119e-05, "loss": 1.0221, "step": 1872 }, { "epoch": 0.14582258770862733, "grad_norm": 2.440024557169397, "learning_rate": 3.8395120779385544e-05, "loss": 1.0396, "step": 1873 }, { "epoch": 0.14590044280083694, "grad_norm": 2.483830362057101, "learning_rate": 3.839316097006033e-05, "loss": 1.0107, "step": 1874 }, { "epoch": 0.14597829789304656, "grad_norm": 2.459353574372447, "learning_rate": 3.8391200014927654e-05, "loss": 1.0109, "step": 1875 }, { "epoch": 0.1460561529852562, "grad_norm": 2.3912593824516764, "learning_rate": 3.838923791410965e-05, "loss": 1.0349, "step": 1876 }, { "epoch": 0.14613400807746582, "grad_norm": 2.640523535973332, "learning_rate": 3.8387274667728574e-05, "loss": 1.0514, "step": 1877 }, { "epoch": 0.14621186316967544, "grad_norm": 2.792853114006588, "learning_rate": 3.838531027590671e-05, "loss": 1.0782, "step": 1878 }, { "epoch": 0.14628971826188505, "grad_norm": 2.2890035628003975, "learning_rate": 3.838334473876643e-05, "loss": 0.9473, "step": 1879 }, { "epoch": 0.1463675733540947, "grad_norm": 2.711855883279045, "learning_rate": 3.838137805643019e-05, "loss": 1.0525, "step": 1880 }, { "epoch": 0.14644542844630432, "grad_norm": 2.420385373003801, "learning_rate": 3.837941022902049e-05, "loss": 1.0157, "step": 1881 }, { "epoch": 0.14652328353851393, "grad_norm": 2.59535048389754, "learning_rate": 3.837744125665992e-05, "loss": 1.0612, "step": 1882 }, { "epoch": 0.14660113863072358, "grad_norm": 2.821070233002958, "learning_rate": 3.837547113947116e-05, "loss": 1.1577, "step": 1883 }, { "epoch": 0.1466789937229332, "grad_norm": 2.5306067097707317, "learning_rate": 3.83734998775769e-05, "loss": 0.9847, "step": 1884 }, { "epoch": 0.1467568488151428, "grad_norm": 2.410761467125738, "learning_rate": 3.837152747109997e-05, "loss": 1.024, "step": 1885 }, { "epoch": 0.14683470390735245, "grad_norm": 2.595827495088229, "learning_rate": 3.8369553920163235e-05, "loss": 1.0382, "step": 1886 }, { "epoch": 0.14691255899956207, "grad_norm": 2.4904971980084865, "learning_rate": 3.836757922488963e-05, "loss": 1.0213, "step": 1887 }, { "epoch": 0.1469904140917717, "grad_norm": 2.5245691297703865, "learning_rate": 3.836560338540217e-05, "loss": 1.0372, "step": 1888 }, { "epoch": 0.1470682691839813, "grad_norm": 2.4334877493940845, "learning_rate": 3.836362640182396e-05, "loss": 0.9641, "step": 1889 }, { "epoch": 0.14714612427619095, "grad_norm": 2.4532909248568466, "learning_rate": 3.836164827427813e-05, "loss": 0.9512, "step": 1890 }, { "epoch": 0.14722397936840056, "grad_norm": 2.549634029152512, "learning_rate": 3.835966900288792e-05, "loss": 1.047, "step": 1891 }, { "epoch": 0.14730183446061018, "grad_norm": 2.5550778789833615, "learning_rate": 3.8357688587776634e-05, "loss": 1.1203, "step": 1892 }, { "epoch": 0.14737968955281983, "grad_norm": 2.4777207741877763, "learning_rate": 3.835570702906763e-05, "loss": 1.0522, "step": 1893 }, { "epoch": 0.14745754464502944, "grad_norm": 2.538594275041004, "learning_rate": 3.8353724326884364e-05, "loss": 1.0751, "step": 1894 }, { "epoch": 0.14753539973723906, "grad_norm": 2.5034152580607474, "learning_rate": 3.835174048135034e-05, "loss": 0.9873, "step": 1895 }, { "epoch": 0.14761325482944868, "grad_norm": 2.347757161628829, "learning_rate": 3.834975549258915e-05, "loss": 0.9438, "step": 1896 }, { "epoch": 0.14769110992165832, "grad_norm": 2.5254737773551437, "learning_rate": 3.834776936072444e-05, "loss": 1.0247, "step": 1897 }, { "epoch": 0.14776896501386794, "grad_norm": 2.708396426258635, "learning_rate": 3.834578208587994e-05, "loss": 0.9723, "step": 1898 }, { "epoch": 0.14784682010607755, "grad_norm": 2.4123090054809015, "learning_rate": 3.834379366817945e-05, "loss": 0.937, "step": 1899 }, { "epoch": 0.1479246751982872, "grad_norm": 2.531679664221714, "learning_rate": 3.8341804107746837e-05, "loss": 0.9818, "step": 1900 }, { "epoch": 0.1479246751982872, "eval_loss": 0.12720607221126556, "eval_runtime": 162.3607, "eval_samples_per_second": 17.738, "eval_steps_per_second": 0.634, "step": 1900 }, { "epoch": 0.14800253029049681, "grad_norm": 2.538467048102559, "learning_rate": 3.8339813404706044e-05, "loss": 0.9704, "step": 1901 }, { "epoch": 0.14808038538270643, "grad_norm": 2.5907664982341236, "learning_rate": 3.833782155918108e-05, "loss": 1.0289, "step": 1902 }, { "epoch": 0.14815824047491608, "grad_norm": 2.4606885082923196, "learning_rate": 3.833582857129603e-05, "loss": 1.0128, "step": 1903 }, { "epoch": 0.1482360955671257, "grad_norm": 2.4702766841283976, "learning_rate": 3.8333834441175046e-05, "loss": 1.0166, "step": 1904 }, { "epoch": 0.1483139506593353, "grad_norm": 2.5153268431300244, "learning_rate": 3.833183916894235e-05, "loss": 1.0084, "step": 1905 }, { "epoch": 0.14839180575154493, "grad_norm": 2.573289853926925, "learning_rate": 3.832984275472225e-05, "loss": 0.9821, "step": 1906 }, { "epoch": 0.14846966084375457, "grad_norm": 2.360350562221416, "learning_rate": 3.832784519863909e-05, "loss": 0.9819, "step": 1907 }, { "epoch": 0.1485475159359642, "grad_norm": 2.3597188668409848, "learning_rate": 3.832584650081733e-05, "loss": 0.9705, "step": 1908 }, { "epoch": 0.1486253710281738, "grad_norm": 2.7358400427425864, "learning_rate": 3.832384666138147e-05, "loss": 1.0558, "step": 1909 }, { "epoch": 0.14870322612038345, "grad_norm": 2.402189271868872, "learning_rate": 3.83218456804561e-05, "loss": 0.9856, "step": 1910 }, { "epoch": 0.14878108121259306, "grad_norm": 2.4901380358721132, "learning_rate": 3.8319843558165855e-05, "loss": 0.9982, "step": 1911 }, { "epoch": 0.14885893630480268, "grad_norm": 2.463834704009947, "learning_rate": 3.831784029463547e-05, "loss": 0.9967, "step": 1912 }, { "epoch": 0.1489367913970123, "grad_norm": 2.587952174099362, "learning_rate": 3.8315835889989734e-05, "loss": 0.9727, "step": 1913 }, { "epoch": 0.14901464648922194, "grad_norm": 2.471081790156677, "learning_rate": 3.831383034435352e-05, "loss": 0.9637, "step": 1914 }, { "epoch": 0.14909250158143156, "grad_norm": 2.4709621945747107, "learning_rate": 3.8311823657851755e-05, "loss": 0.9856, "step": 1915 }, { "epoch": 0.14917035667364117, "grad_norm": 2.5498365359041246, "learning_rate": 3.830981583060945e-05, "loss": 0.9971, "step": 1916 }, { "epoch": 0.14924821176585082, "grad_norm": 2.577493712781621, "learning_rate": 3.830780686275168e-05, "loss": 1.0412, "step": 1917 }, { "epoch": 0.14932606685806044, "grad_norm": 2.3060996363899453, "learning_rate": 3.8305796754403604e-05, "loss": 0.9086, "step": 1918 }, { "epoch": 0.14940392195027005, "grad_norm": 2.351472932748611, "learning_rate": 3.830378550569043e-05, "loss": 0.9881, "step": 1919 }, { "epoch": 0.1494817770424797, "grad_norm": 2.5036121150449206, "learning_rate": 3.830177311673745e-05, "loss": 0.9944, "step": 1920 }, { "epoch": 0.1495596321346893, "grad_norm": 2.409991992151651, "learning_rate": 3.829975958767003e-05, "loss": 0.9422, "step": 1921 }, { "epoch": 0.14963748722689893, "grad_norm": 2.5026577638000544, "learning_rate": 3.829774491861361e-05, "loss": 1.0561, "step": 1922 }, { "epoch": 0.14971534231910855, "grad_norm": 2.5224990663143445, "learning_rate": 3.8295729109693687e-05, "loss": 1.0495, "step": 1923 }, { "epoch": 0.1497931974113182, "grad_norm": 2.381994033985152, "learning_rate": 3.8293712161035846e-05, "loss": 1.0032, "step": 1924 }, { "epoch": 0.1498710525035278, "grad_norm": 2.5121042963715983, "learning_rate": 3.8291694072765715e-05, "loss": 0.9947, "step": 1925 }, { "epoch": 0.14994890759573742, "grad_norm": 2.681758365619551, "learning_rate": 3.828967484500902e-05, "loss": 1.0268, "step": 1926 }, { "epoch": 0.15002676268794707, "grad_norm": 2.502222926127211, "learning_rate": 3.828765447789156e-05, "loss": 0.9432, "step": 1927 }, { "epoch": 0.15010461778015668, "grad_norm": 2.4079324323476907, "learning_rate": 3.828563297153918e-05, "loss": 0.9747, "step": 1928 }, { "epoch": 0.1501824728723663, "grad_norm": 2.5917920618785457, "learning_rate": 3.8283610326077816e-05, "loss": 0.9768, "step": 1929 }, { "epoch": 0.15026032796457595, "grad_norm": 2.6090398521226392, "learning_rate": 3.828158654163347e-05, "loss": 1.0628, "step": 1930 }, { "epoch": 0.15033818305678556, "grad_norm": 2.369729146843658, "learning_rate": 3.8279561618332214e-05, "loss": 0.9269, "step": 1931 }, { "epoch": 0.15041603814899518, "grad_norm": 2.3365055490026374, "learning_rate": 3.8277535556300194e-05, "loss": 0.925, "step": 1932 }, { "epoch": 0.1504938932412048, "grad_norm": 2.48205871115025, "learning_rate": 3.827550835566362e-05, "loss": 0.9572, "step": 1933 }, { "epoch": 0.15057174833341444, "grad_norm": 2.4486729899637694, "learning_rate": 3.827348001654877e-05, "loss": 0.9977, "step": 1934 }, { "epoch": 0.15064960342562406, "grad_norm": 2.5014255005086006, "learning_rate": 3.8271450539082014e-05, "loss": 1.0225, "step": 1935 }, { "epoch": 0.15072745851783367, "grad_norm": 2.6050839289743157, "learning_rate": 3.826941992338977e-05, "loss": 0.9635, "step": 1936 }, { "epoch": 0.15080531361004332, "grad_norm": 2.439967695857665, "learning_rate": 3.826738816959855e-05, "loss": 0.9715, "step": 1937 }, { "epoch": 0.15088316870225293, "grad_norm": 2.4189017747940214, "learning_rate": 3.82653552778349e-05, "loss": 0.9726, "step": 1938 }, { "epoch": 0.15096102379446255, "grad_norm": 2.5152906504700443, "learning_rate": 3.826332124822547e-05, "loss": 0.9962, "step": 1939 }, { "epoch": 0.15103887888667217, "grad_norm": 2.5966660545476317, "learning_rate": 3.826128608089698e-05, "loss": 0.9537, "step": 1940 }, { "epoch": 0.1511167339788818, "grad_norm": 2.788089850897013, "learning_rate": 3.8259249775976196e-05, "loss": 1.0116, "step": 1941 }, { "epoch": 0.15119458907109143, "grad_norm": 2.4234228268083684, "learning_rate": 3.825721233358998e-05, "loss": 0.9897, "step": 1942 }, { "epoch": 0.15127244416330105, "grad_norm": 2.5299944688692957, "learning_rate": 3.8255173753865244e-05, "loss": 1.0289, "step": 1943 }, { "epoch": 0.1513502992555107, "grad_norm": 2.5100899873865345, "learning_rate": 3.8253134036929e-05, "loss": 0.9794, "step": 1944 }, { "epoch": 0.1514281543477203, "grad_norm": 2.6079769375241733, "learning_rate": 3.82510931829083e-05, "loss": 1.0395, "step": 1945 }, { "epoch": 0.15150600943992992, "grad_norm": 2.7187892167269565, "learning_rate": 3.824905119193028e-05, "loss": 1.0277, "step": 1946 }, { "epoch": 0.15158386453213957, "grad_norm": 2.470404337567538, "learning_rate": 3.8247008064122146e-05, "loss": 0.9924, "step": 1947 }, { "epoch": 0.15166171962434918, "grad_norm": 2.4768152557842154, "learning_rate": 3.824496379961119e-05, "loss": 0.9852, "step": 1948 }, { "epoch": 0.1517395747165588, "grad_norm": 2.3954738290830635, "learning_rate": 3.8242918398524735e-05, "loss": 0.9747, "step": 1949 }, { "epoch": 0.15181742980876842, "grad_norm": 2.4440613276208905, "learning_rate": 3.824087186099022e-05, "loss": 0.9644, "step": 1950 }, { "epoch": 0.15181742980876842, "eval_loss": 0.12355648726224899, "eval_runtime": 162.6914, "eval_samples_per_second": 17.702, "eval_steps_per_second": 0.633, "step": 1950 }, { "epoch": 0.15189528490097806, "grad_norm": 2.662102308973982, "learning_rate": 3.823882418713513e-05, "loss": 1.0069, "step": 1951 }, { "epoch": 0.15197313999318768, "grad_norm": 2.626951740588321, "learning_rate": 3.823677537708702e-05, "loss": 0.9655, "step": 1952 }, { "epoch": 0.1520509950853973, "grad_norm": 2.8063641275741706, "learning_rate": 3.823472543097352e-05, "loss": 1.0185, "step": 1953 }, { "epoch": 0.15212885017760694, "grad_norm": 2.3420419833275266, "learning_rate": 3.823267434892234e-05, "loss": 0.9405, "step": 1954 }, { "epoch": 0.15220670526981656, "grad_norm": 2.573955716058004, "learning_rate": 3.823062213106125e-05, "loss": 0.9911, "step": 1955 }, { "epoch": 0.15228456036202617, "grad_norm": 2.474891373124564, "learning_rate": 3.822856877751809e-05, "loss": 0.9379, "step": 1956 }, { "epoch": 0.15236241545423582, "grad_norm": 2.5266690491072645, "learning_rate": 3.822651428842078e-05, "loss": 1.0087, "step": 1957 }, { "epoch": 0.15244027054644543, "grad_norm": 2.4343734605093226, "learning_rate": 3.822445866389731e-05, "loss": 1.0088, "step": 1958 }, { "epoch": 0.15251812563865505, "grad_norm": 2.7354586812653934, "learning_rate": 3.8222401904075715e-05, "loss": 1.0423, "step": 1959 }, { "epoch": 0.15259598073086467, "grad_norm": 2.449502321553864, "learning_rate": 3.822034400908414e-05, "loss": 0.9328, "step": 1960 }, { "epoch": 0.1526738358230743, "grad_norm": 2.388912782675011, "learning_rate": 3.821828497905077e-05, "loss": 0.9483, "step": 1961 }, { "epoch": 0.15275169091528393, "grad_norm": 2.8937003385276796, "learning_rate": 3.8216224814103885e-05, "loss": 1.0869, "step": 1962 }, { "epoch": 0.15282954600749354, "grad_norm": 2.508688914355508, "learning_rate": 3.821416351437182e-05, "loss": 0.9931, "step": 1963 }, { "epoch": 0.1529074010997032, "grad_norm": 2.5931374482177167, "learning_rate": 3.8212101079982986e-05, "loss": 0.9919, "step": 1964 }, { "epoch": 0.1529852561919128, "grad_norm": 2.506683165713034, "learning_rate": 3.821003751106585e-05, "loss": 0.9892, "step": 1965 }, { "epoch": 0.15306311128412242, "grad_norm": 2.40669044140197, "learning_rate": 3.820797280774897e-05, "loss": 0.9021, "step": 1966 }, { "epoch": 0.15314096637633204, "grad_norm": 2.4182759725678484, "learning_rate": 3.820590697016098e-05, "loss": 0.9523, "step": 1967 }, { "epoch": 0.15321882146854168, "grad_norm": 2.53196920871066, "learning_rate": 3.8203839998430554e-05, "loss": 0.9952, "step": 1968 }, { "epoch": 0.1532966765607513, "grad_norm": 2.4666896331592145, "learning_rate": 3.8201771892686456e-05, "loss": 0.9762, "step": 1969 }, { "epoch": 0.15337453165296092, "grad_norm": 2.5055721261240014, "learning_rate": 3.8199702653057535e-05, "loss": 0.9932, "step": 1970 }, { "epoch": 0.15345238674517056, "grad_norm": 2.8152921656571173, "learning_rate": 3.8197632279672684e-05, "loss": 1.0854, "step": 1971 }, { "epoch": 0.15353024183738018, "grad_norm": 2.4187181985723694, "learning_rate": 3.8195560772660874e-05, "loss": 0.9933, "step": 1972 }, { "epoch": 0.1536080969295898, "grad_norm": 2.398187221459603, "learning_rate": 3.819348813215116e-05, "loss": 0.9664, "step": 1973 }, { "epoch": 0.15368595202179944, "grad_norm": 2.5705214791627182, "learning_rate": 3.819141435827264e-05, "loss": 0.9529, "step": 1974 }, { "epoch": 0.15376380711400905, "grad_norm": 2.7622375806106887, "learning_rate": 3.8189339451154526e-05, "loss": 1.0129, "step": 1975 }, { "epoch": 0.15384166220621867, "grad_norm": 2.5006674549563757, "learning_rate": 3.818726341092606e-05, "loss": 0.9259, "step": 1976 }, { "epoch": 0.1539195172984283, "grad_norm": 2.608456893864879, "learning_rate": 3.818518623771657e-05, "loss": 1.0188, "step": 1977 }, { "epoch": 0.15399737239063793, "grad_norm": 2.640291867459224, "learning_rate": 3.8183107931655445e-05, "loss": 0.9606, "step": 1978 }, { "epoch": 0.15407522748284755, "grad_norm": 2.4355133731120504, "learning_rate": 3.818102849287217e-05, "loss": 0.9224, "step": 1979 }, { "epoch": 0.15415308257505717, "grad_norm": 2.703556659898224, "learning_rate": 3.817894792149629e-05, "loss": 1.0229, "step": 1980 }, { "epoch": 0.1542309376672668, "grad_norm": 2.3217494216313748, "learning_rate": 3.817686621765739e-05, "loss": 0.9029, "step": 1981 }, { "epoch": 0.15430879275947643, "grad_norm": 2.6464080366869096, "learning_rate": 3.817478338148517e-05, "loss": 1.0179, "step": 1982 }, { "epoch": 0.15438664785168604, "grad_norm": 2.6451705490565565, "learning_rate": 3.8172699413109365e-05, "loss": 1.0028, "step": 1983 }, { "epoch": 0.1544645029438957, "grad_norm": 2.4998831622243807, "learning_rate": 3.817061431265981e-05, "loss": 0.912, "step": 1984 }, { "epoch": 0.1545423580361053, "grad_norm": 2.554170833862199, "learning_rate": 3.81685280802664e-05, "loss": 1.006, "step": 1985 }, { "epoch": 0.15462021312831492, "grad_norm": 2.5047942292354204, "learning_rate": 3.8166440716059086e-05, "loss": 0.9876, "step": 1986 }, { "epoch": 0.15469806822052454, "grad_norm": 2.3917992158226458, "learning_rate": 3.8164352220167904e-05, "loss": 0.9738, "step": 1987 }, { "epoch": 0.15477592331273418, "grad_norm": 2.4309123022324703, "learning_rate": 3.816226259272296e-05, "loss": 0.9434, "step": 1988 }, { "epoch": 0.1548537784049438, "grad_norm": 2.5373117927291173, "learning_rate": 3.8160171833854424e-05, "loss": 0.95, "step": 1989 }, { "epoch": 0.15493163349715341, "grad_norm": 2.6039707091323017, "learning_rate": 3.8158079943692545e-05, "loss": 0.953, "step": 1990 }, { "epoch": 0.15500948858936306, "grad_norm": 2.548774414083268, "learning_rate": 3.815598692236764e-05, "loss": 0.9241, "step": 1991 }, { "epoch": 0.15508734368157268, "grad_norm": 2.6278486019339304, "learning_rate": 3.8153892770010085e-05, "loss": 0.966, "step": 1992 }, { "epoch": 0.1551651987737823, "grad_norm": 2.6898400103207702, "learning_rate": 3.815179748675034e-05, "loss": 1.0208, "step": 1993 }, { "epoch": 0.1552430538659919, "grad_norm": 2.786841417887521, "learning_rate": 3.814970107271894e-05, "loss": 0.9653, "step": 1994 }, { "epoch": 0.15532090895820155, "grad_norm": 2.3668971907795378, "learning_rate": 3.814760352804647e-05, "loss": 0.9309, "step": 1995 }, { "epoch": 0.15539876405041117, "grad_norm": 2.2778913855354763, "learning_rate": 3.81455048528636e-05, "loss": 0.9873, "step": 1996 }, { "epoch": 0.1554766191426208, "grad_norm": 2.553917653331978, "learning_rate": 3.814340504730106e-05, "loss": 0.9165, "step": 1997 }, { "epoch": 0.15555447423483043, "grad_norm": 2.5323869762117455, "learning_rate": 3.814130411148968e-05, "loss": 0.9718, "step": 1998 }, { "epoch": 0.15563232932704005, "grad_norm": 2.425992319440732, "learning_rate": 3.8139202045560325e-05, "loss": 0.8806, "step": 1999 }, { "epoch": 0.15571018441924966, "grad_norm": 2.545232552078083, "learning_rate": 3.813709884964394e-05, "loss": 0.9787, "step": 2000 }, { "epoch": 0.15571018441924966, "eval_loss": 0.11895980685949326, "eval_runtime": 162.277, "eval_samples_per_second": 17.747, "eval_steps_per_second": 0.635, "step": 2000 }, { "epoch": 0.1557880395114593, "grad_norm": 2.534306993846759, "learning_rate": 3.813499452387155e-05, "loss": 0.9177, "step": 2001 }, { "epoch": 0.15586589460366893, "grad_norm": 2.7177220318561766, "learning_rate": 3.8132889068374246e-05, "loss": 0.9276, "step": 2002 }, { "epoch": 0.15594374969587854, "grad_norm": 2.395483972187887, "learning_rate": 3.813078248328318e-05, "loss": 0.9543, "step": 2003 }, { "epoch": 0.15602160478808816, "grad_norm": 2.843045838188893, "learning_rate": 3.812867476872959e-05, "loss": 0.9272, "step": 2004 }, { "epoch": 0.1560994598802978, "grad_norm": 2.9537929503662337, "learning_rate": 3.812656592484478e-05, "loss": 0.9553, "step": 2005 }, { "epoch": 0.15617731497250742, "grad_norm": 2.6354637740230533, "learning_rate": 3.812445595176011e-05, "loss": 0.9481, "step": 2006 }, { "epoch": 0.15625517006471704, "grad_norm": 2.85459039037015, "learning_rate": 3.8122344849607026e-05, "loss": 1.0037, "step": 2007 }, { "epoch": 0.15633302515692668, "grad_norm": 3.0313830993474813, "learning_rate": 3.812023261851704e-05, "loss": 0.9934, "step": 2008 }, { "epoch": 0.1564108802491363, "grad_norm": 2.5003443777337164, "learning_rate": 3.811811925862173e-05, "loss": 0.884, "step": 2009 }, { "epoch": 0.1564887353413459, "grad_norm": 2.708044228630356, "learning_rate": 3.811600477005277e-05, "loss": 0.9259, "step": 2010 }, { "epoch": 0.15656659043355556, "grad_norm": 2.7310969091569577, "learning_rate": 3.8113889152941845e-05, "loss": 0.9248, "step": 2011 }, { "epoch": 0.15664444552576517, "grad_norm": 2.782093760648851, "learning_rate": 3.811177240742078e-05, "loss": 1.0567, "step": 2012 }, { "epoch": 0.1567223006179748, "grad_norm": 2.784951901057765, "learning_rate": 3.8109654533621425e-05, "loss": 1.0034, "step": 2013 }, { "epoch": 0.1568001557101844, "grad_norm": 2.5628380293474002, "learning_rate": 3.810753553167572e-05, "loss": 0.9377, "step": 2014 }, { "epoch": 0.15687801080239405, "grad_norm": 2.3249173028332066, "learning_rate": 3.810541540171565e-05, "loss": 0.9245, "step": 2015 }, { "epoch": 0.15695586589460367, "grad_norm": 2.519393623277049, "learning_rate": 3.810329414387331e-05, "loss": 0.9526, "step": 2016 }, { "epoch": 0.15703372098681329, "grad_norm": 2.61658689989093, "learning_rate": 3.810117175828085e-05, "loss": 1.0305, "step": 2017 }, { "epoch": 0.15711157607902293, "grad_norm": 2.317888035607777, "learning_rate": 3.809904824507046e-05, "loss": 0.961, "step": 2018 }, { "epoch": 0.15718943117123255, "grad_norm": 2.3798359611760618, "learning_rate": 3.809692360437444e-05, "loss": 0.9176, "step": 2019 }, { "epoch": 0.15726728626344216, "grad_norm": 2.402470047322003, "learning_rate": 3.809479783632514e-05, "loss": 0.9164, "step": 2020 }, { "epoch": 0.15734514135565178, "grad_norm": 2.528570343445851, "learning_rate": 3.809267094105499e-05, "loss": 1.0161, "step": 2021 }, { "epoch": 0.15742299644786142, "grad_norm": 2.2786052054077377, "learning_rate": 3.809054291869649e-05, "loss": 0.8975, "step": 2022 }, { "epoch": 0.15750085154007104, "grad_norm": 2.5336786496355326, "learning_rate": 3.8088413769382186e-05, "loss": 0.9114, "step": 2023 }, { "epoch": 0.15757870663228066, "grad_norm": 2.353901787222624, "learning_rate": 3.8086283493244736e-05, "loss": 0.8736, "step": 2024 }, { "epoch": 0.1576565617244903, "grad_norm": 2.3085956423734553, "learning_rate": 3.808415209041684e-05, "loss": 0.8983, "step": 2025 }, { "epoch": 0.15773441681669992, "grad_norm": 2.3362648398168577, "learning_rate": 3.808201956103126e-05, "loss": 0.8352, "step": 2026 }, { "epoch": 0.15781227190890953, "grad_norm": 2.5856061510701562, "learning_rate": 3.8079885905220865e-05, "loss": 0.9662, "step": 2027 }, { "epoch": 0.15789012700111918, "grad_norm": 2.476924648945529, "learning_rate": 3.807775112311856e-05, "loss": 0.9444, "step": 2028 }, { "epoch": 0.1579679820933288, "grad_norm": 2.3717582163795554, "learning_rate": 3.8075615214857327e-05, "loss": 0.9517, "step": 2029 }, { "epoch": 0.1580458371855384, "grad_norm": 2.463250021494809, "learning_rate": 3.8073478180570233e-05, "loss": 0.9322, "step": 2030 }, { "epoch": 0.15812369227774803, "grad_norm": 2.466141446188845, "learning_rate": 3.8071340020390406e-05, "loss": 0.9199, "step": 2031 }, { "epoch": 0.15820154736995767, "grad_norm": 2.44137679008329, "learning_rate": 3.806920073445104e-05, "loss": 0.9245, "step": 2032 }, { "epoch": 0.1582794024621673, "grad_norm": 2.558969590198992, "learning_rate": 3.8067060322885386e-05, "loss": 1.0178, "step": 2033 }, { "epoch": 0.1583572575543769, "grad_norm": 2.5739018694042795, "learning_rate": 3.8064918785826805e-05, "loss": 0.9719, "step": 2034 }, { "epoch": 0.15843511264658655, "grad_norm": 2.442241786679264, "learning_rate": 3.8062776123408694e-05, "loss": 0.8846, "step": 2035 }, { "epoch": 0.15851296773879617, "grad_norm": 2.3627902508349927, "learning_rate": 3.806063233576454e-05, "loss": 0.8998, "step": 2036 }, { "epoch": 0.15859082283100578, "grad_norm": 2.5788693732880583, "learning_rate": 3.805848742302788e-05, "loss": 0.9269, "step": 2037 }, { "epoch": 0.15866867792321543, "grad_norm": 2.4863469336416957, "learning_rate": 3.805634138533234e-05, "loss": 0.9439, "step": 2038 }, { "epoch": 0.15874653301542505, "grad_norm": 2.438747099515511, "learning_rate": 3.805419422281159e-05, "loss": 0.9715, "step": 2039 }, { "epoch": 0.15882438810763466, "grad_norm": 2.583458643646503, "learning_rate": 3.8052045935599414e-05, "loss": 0.9985, "step": 2040 }, { "epoch": 0.15890224319984428, "grad_norm": 2.5120660814730518, "learning_rate": 3.804989652382963e-05, "loss": 0.9498, "step": 2041 }, { "epoch": 0.15898009829205392, "grad_norm": 2.4586428042813804, "learning_rate": 3.804774598763613e-05, "loss": 0.9067, "step": 2042 }, { "epoch": 0.15905795338426354, "grad_norm": 2.4002851953326068, "learning_rate": 3.804559432715289e-05, "loss": 0.9866, "step": 2043 }, { "epoch": 0.15913580847647316, "grad_norm": 2.5281530229359666, "learning_rate": 3.804344154251394e-05, "loss": 1.005, "step": 2044 }, { "epoch": 0.1592136635686828, "grad_norm": 2.3762012590040285, "learning_rate": 3.80412876338534e-05, "loss": 0.8978, "step": 2045 }, { "epoch": 0.15929151866089242, "grad_norm": 2.309049373586401, "learning_rate": 3.803913260130543e-05, "loss": 0.8743, "step": 2046 }, { "epoch": 0.15936937375310203, "grad_norm": 2.532587041299762, "learning_rate": 3.80369764450043e-05, "loss": 0.9568, "step": 2047 }, { "epoch": 0.15944722884531165, "grad_norm": 2.5402611456454833, "learning_rate": 3.803481916508432e-05, "loss": 0.8997, "step": 2048 }, { "epoch": 0.1595250839375213, "grad_norm": 2.5002131736993087, "learning_rate": 3.803266076167987e-05, "loss": 0.8828, "step": 2049 }, { "epoch": 0.1596029390297309, "grad_norm": 2.3850491193718963, "learning_rate": 3.8030501234925425e-05, "loss": 0.9325, "step": 2050 }, { "epoch": 0.1596029390297309, "eval_loss": 0.11618960648775101, "eval_runtime": 162.3493, "eval_samples_per_second": 17.74, "eval_steps_per_second": 0.634, "step": 2050 }, { "epoch": 0.15968079412194053, "grad_norm": 2.434426640772001, "learning_rate": 3.80283405849555e-05, "loss": 0.9355, "step": 2051 }, { "epoch": 0.15975864921415017, "grad_norm": 2.3918566525842393, "learning_rate": 3.8026178811904694e-05, "loss": 0.9306, "step": 2052 }, { "epoch": 0.1598365043063598, "grad_norm": 2.2884370343889064, "learning_rate": 3.802401591590768e-05, "loss": 0.939, "step": 2053 }, { "epoch": 0.1599143593985694, "grad_norm": 2.4632338423361504, "learning_rate": 3.8021851897099196e-05, "loss": 0.9471, "step": 2054 }, { "epoch": 0.15999221449077905, "grad_norm": 2.4287278525485196, "learning_rate": 3.801968675561405e-05, "loss": 0.9332, "step": 2055 }, { "epoch": 0.16007006958298867, "grad_norm": 2.3423559999269967, "learning_rate": 3.801752049158712e-05, "loss": 0.8645, "step": 2056 }, { "epoch": 0.16014792467519828, "grad_norm": 2.7216316258985946, "learning_rate": 3.801535310515334e-05, "loss": 0.9924, "step": 2057 }, { "epoch": 0.1602257797674079, "grad_norm": 2.4585930986897777, "learning_rate": 3.801318459644776e-05, "loss": 0.9447, "step": 2058 }, { "epoch": 0.16030363485961754, "grad_norm": 2.6128642276271563, "learning_rate": 3.8011014965605435e-05, "loss": 0.9714, "step": 2059 }, { "epoch": 0.16038148995182716, "grad_norm": 2.4560600673701543, "learning_rate": 3.8008844212761545e-05, "loss": 0.9456, "step": 2060 }, { "epoch": 0.16045934504403678, "grad_norm": 2.5541389860645136, "learning_rate": 3.800667233805131e-05, "loss": 0.9468, "step": 2061 }, { "epoch": 0.16053720013624642, "grad_norm": 2.5381672769666075, "learning_rate": 3.800449934161002e-05, "loss": 0.9337, "step": 2062 }, { "epoch": 0.16061505522845604, "grad_norm": 2.4289363201178613, "learning_rate": 3.8002325223573054e-05, "loss": 0.9392, "step": 2063 }, { "epoch": 0.16069291032066566, "grad_norm": 2.444809799108547, "learning_rate": 3.800014998407585e-05, "loss": 0.9286, "step": 2064 }, { "epoch": 0.1607707654128753, "grad_norm": 2.459554681477483, "learning_rate": 3.79979736232539e-05, "loss": 0.8966, "step": 2065 }, { "epoch": 0.16084862050508492, "grad_norm": 2.5947411147707466, "learning_rate": 3.79957961412428e-05, "loss": 0.9599, "step": 2066 }, { "epoch": 0.16092647559729453, "grad_norm": 2.216464175660971, "learning_rate": 3.799361753817819e-05, "loss": 0.8765, "step": 2067 }, { "epoch": 0.16100433068950415, "grad_norm": 2.382841069357389, "learning_rate": 3.7991437814195775e-05, "loss": 0.8834, "step": 2068 }, { "epoch": 0.1610821857817138, "grad_norm": 2.537662067990666, "learning_rate": 3.7989256969431355e-05, "loss": 0.8543, "step": 2069 }, { "epoch": 0.1611600408739234, "grad_norm": 2.510214593864412, "learning_rate": 3.798707500402078e-05, "loss": 0.8799, "step": 2070 }, { "epoch": 0.16123789596613303, "grad_norm": 2.3341612778000056, "learning_rate": 3.7984891918099985e-05, "loss": 0.8991, "step": 2071 }, { "epoch": 0.16131575105834267, "grad_norm": 2.7454423158414754, "learning_rate": 3.798270771180496e-05, "loss": 1.0126, "step": 2072 }, { "epoch": 0.1613936061505523, "grad_norm": 2.443351324006353, "learning_rate": 3.798052238527177e-05, "loss": 0.9734, "step": 2073 }, { "epoch": 0.1614714612427619, "grad_norm": 2.6520562263702314, "learning_rate": 3.797833593863655e-05, "loss": 0.991, "step": 2074 }, { "epoch": 0.16154931633497152, "grad_norm": 2.412898238551514, "learning_rate": 3.797614837203551e-05, "loss": 0.9508, "step": 2075 }, { "epoch": 0.16162717142718117, "grad_norm": 2.408877002043142, "learning_rate": 3.7973959685604914e-05, "loss": 0.9322, "step": 2076 }, { "epoch": 0.16170502651939078, "grad_norm": 2.366858106157414, "learning_rate": 3.7971769879481116e-05, "loss": 0.9071, "step": 2077 }, { "epoch": 0.1617828816116004, "grad_norm": 2.507173678572951, "learning_rate": 3.796957895380053e-05, "loss": 0.9486, "step": 2078 }, { "epoch": 0.16186073670381004, "grad_norm": 2.5310157742108674, "learning_rate": 3.796738690869964e-05, "loss": 0.9387, "step": 2079 }, { "epoch": 0.16193859179601966, "grad_norm": 2.5208381303418212, "learning_rate": 3.7965193744315004e-05, "loss": 0.8814, "step": 2080 }, { "epoch": 0.16201644688822928, "grad_norm": 2.7218412448972553, "learning_rate": 3.796299946078324e-05, "loss": 0.9541, "step": 2081 }, { "epoch": 0.16209430198043892, "grad_norm": 2.6904918004606717, "learning_rate": 3.7960804058241045e-05, "loss": 0.9252, "step": 2082 }, { "epoch": 0.16217215707264854, "grad_norm": 2.5859626589168894, "learning_rate": 3.795860753682518e-05, "loss": 0.9605, "step": 2083 }, { "epoch": 0.16225001216485815, "grad_norm": 2.4632180687063836, "learning_rate": 3.795640989667247e-05, "loss": 0.9331, "step": 2084 }, { "epoch": 0.16232786725706777, "grad_norm": 2.4088376085243577, "learning_rate": 3.7954211137919834e-05, "loss": 0.8919, "step": 2085 }, { "epoch": 0.16240572234927741, "grad_norm": 2.4280472232592576, "learning_rate": 3.7952011260704234e-05, "loss": 0.8783, "step": 2086 }, { "epoch": 0.16248357744148703, "grad_norm": 2.7878173976421063, "learning_rate": 3.7949810265162717e-05, "loss": 1.0027, "step": 2087 }, { "epoch": 0.16256143253369665, "grad_norm": 2.4458891729548102, "learning_rate": 3.7947608151432385e-05, "loss": 1.0027, "step": 2088 }, { "epoch": 0.1626392876259063, "grad_norm": 2.408848895665559, "learning_rate": 3.794540491965044e-05, "loss": 0.8917, "step": 2089 }, { "epoch": 0.1627171427181159, "grad_norm": 2.3129141653979155, "learning_rate": 3.7943200569954104e-05, "loss": 0.8366, "step": 2090 }, { "epoch": 0.16279499781032553, "grad_norm": 2.413668550694571, "learning_rate": 3.794099510248072e-05, "loss": 0.8583, "step": 2091 }, { "epoch": 0.16287285290253517, "grad_norm": 2.4779872331220445, "learning_rate": 3.7938788517367674e-05, "loss": 0.9184, "step": 2092 }, { "epoch": 0.1629507079947448, "grad_norm": 2.465141096718966, "learning_rate": 3.793658081475242e-05, "loss": 0.8705, "step": 2093 }, { "epoch": 0.1630285630869544, "grad_norm": 2.5004687824001057, "learning_rate": 3.793437199477249e-05, "loss": 0.8448, "step": 2094 }, { "epoch": 0.16310641817916402, "grad_norm": 2.3895741204388923, "learning_rate": 3.793216205756548e-05, "loss": 0.87, "step": 2095 }, { "epoch": 0.16318427327137366, "grad_norm": 2.6208260082337893, "learning_rate": 3.792995100326907e-05, "loss": 0.8797, "step": 2096 }, { "epoch": 0.16326212836358328, "grad_norm": 2.554091384391887, "learning_rate": 3.7927738832020986e-05, "loss": 0.8549, "step": 2097 }, { "epoch": 0.1633399834557929, "grad_norm": 2.5778615548084725, "learning_rate": 3.7925525543959046e-05, "loss": 0.8978, "step": 2098 }, { "epoch": 0.16341783854800254, "grad_norm": 2.60756365357567, "learning_rate": 3.7923311139221114e-05, "loss": 0.9518, "step": 2099 }, { "epoch": 0.16349569364021216, "grad_norm": 2.5073412765558403, "learning_rate": 3.792109561794515e-05, "loss": 0.9079, "step": 2100 }, { "epoch": 0.16349569364021216, "eval_loss": 0.11224610358476639, "eval_runtime": 162.5007, "eval_samples_per_second": 17.723, "eval_steps_per_second": 0.634, "step": 2100 }, { "epoch": 0.16357354873242178, "grad_norm": 2.3233895223038483, "learning_rate": 3.791887898026916e-05, "loss": 0.8889, "step": 2101 }, { "epoch": 0.1636514038246314, "grad_norm": 2.42015444693406, "learning_rate": 3.7916661226331245e-05, "loss": 0.8129, "step": 2102 }, { "epoch": 0.16372925891684104, "grad_norm": 2.263191638762397, "learning_rate": 3.791444235626955e-05, "loss": 0.8756, "step": 2103 }, { "epoch": 0.16380711400905065, "grad_norm": 2.650300366649692, "learning_rate": 3.79122223702223e-05, "loss": 0.9027, "step": 2104 }, { "epoch": 0.16388496910126027, "grad_norm": 2.3867851099484585, "learning_rate": 3.791000126832779e-05, "loss": 0.8797, "step": 2105 }, { "epoch": 0.1639628241934699, "grad_norm": 2.484311448198109, "learning_rate": 3.790777905072438e-05, "loss": 0.9079, "step": 2106 }, { "epoch": 0.16404067928567953, "grad_norm": 2.6126867708973944, "learning_rate": 3.790555571755051e-05, "loss": 0.947, "step": 2107 }, { "epoch": 0.16411853437788915, "grad_norm": 2.4714893534151234, "learning_rate": 3.790333126894469e-05, "loss": 0.9105, "step": 2108 }, { "epoch": 0.1641963894700988, "grad_norm": 2.362055002534653, "learning_rate": 3.790110570504548e-05, "loss": 0.8704, "step": 2109 }, { "epoch": 0.1642742445623084, "grad_norm": 2.5887380124036614, "learning_rate": 3.789887902599153e-05, "loss": 0.9166, "step": 2110 }, { "epoch": 0.16435209965451802, "grad_norm": 2.3856032336268527, "learning_rate": 3.789665123192155e-05, "loss": 0.8918, "step": 2111 }, { "epoch": 0.16442995474672764, "grad_norm": 2.2879836821645605, "learning_rate": 3.7894422322974314e-05, "loss": 0.8401, "step": 2112 }, { "epoch": 0.16450780983893729, "grad_norm": 2.436480307826747, "learning_rate": 3.7892192299288685e-05, "loss": 0.8979, "step": 2113 }, { "epoch": 0.1645856649311469, "grad_norm": 2.6984942477639278, "learning_rate": 3.7889961161003573e-05, "loss": 0.9785, "step": 2114 }, { "epoch": 0.16466352002335652, "grad_norm": 2.2345820191553507, "learning_rate": 3.788772890825797e-05, "loss": 0.8545, "step": 2115 }, { "epoch": 0.16474137511556616, "grad_norm": 2.2332522027125767, "learning_rate": 3.788549554119094e-05, "loss": 0.8658, "step": 2116 }, { "epoch": 0.16481923020777578, "grad_norm": 2.490801369188995, "learning_rate": 3.78832610599416e-05, "loss": 0.9472, "step": 2117 }, { "epoch": 0.1648970852999854, "grad_norm": 2.4091291126449765, "learning_rate": 3.788102546464916e-05, "loss": 0.8685, "step": 2118 }, { "epoch": 0.16497494039219504, "grad_norm": 2.56564129126847, "learning_rate": 3.7878788755452885e-05, "loss": 0.9273, "step": 2119 }, { "epoch": 0.16505279548440466, "grad_norm": 2.6925138066947247, "learning_rate": 3.787655093249209e-05, "loss": 0.9626, "step": 2120 }, { "epoch": 0.16513065057661427, "grad_norm": 2.453210326014291, "learning_rate": 3.7874311995906226e-05, "loss": 0.858, "step": 2121 }, { "epoch": 0.1652085056688239, "grad_norm": 2.6004434508287257, "learning_rate": 3.7872071945834726e-05, "loss": 0.9306, "step": 2122 }, { "epoch": 0.16528636076103354, "grad_norm": 2.6356270037908742, "learning_rate": 3.786983078241715e-05, "loss": 0.9451, "step": 2123 }, { "epoch": 0.16536421585324315, "grad_norm": 2.4852060605420703, "learning_rate": 3.786758850579311e-05, "loss": 0.9253, "step": 2124 }, { "epoch": 0.16544207094545277, "grad_norm": 2.475539118568146, "learning_rate": 3.78653451161023e-05, "loss": 0.8973, "step": 2125 }, { "epoch": 0.1655199260376624, "grad_norm": 2.3323666143847146, "learning_rate": 3.786310061348446e-05, "loss": 0.8648, "step": 2126 }, { "epoch": 0.16559778112987203, "grad_norm": 2.6200491777159978, "learning_rate": 3.786085499807941e-05, "loss": 0.9328, "step": 2127 }, { "epoch": 0.16567563622208165, "grad_norm": 2.4940017938311216, "learning_rate": 3.7858608270027055e-05, "loss": 0.8717, "step": 2128 }, { "epoch": 0.16575349131429126, "grad_norm": 2.2034580143673406, "learning_rate": 3.7856360429467344e-05, "loss": 0.8178, "step": 2129 }, { "epoch": 0.1658313464065009, "grad_norm": 2.375548795139312, "learning_rate": 3.785411147654031e-05, "loss": 0.8357, "step": 2130 }, { "epoch": 0.16590920149871052, "grad_norm": 2.5021685457702136, "learning_rate": 3.7851861411386056e-05, "loss": 0.9535, "step": 2131 }, { "epoch": 0.16598705659092014, "grad_norm": 2.3281402807064575, "learning_rate": 3.784961023414474e-05, "loss": 0.8255, "step": 2132 }, { "epoch": 0.16606491168312978, "grad_norm": 2.3818650440747304, "learning_rate": 3.784735794495662e-05, "loss": 0.9003, "step": 2133 }, { "epoch": 0.1661427667753394, "grad_norm": 2.3967767683281056, "learning_rate": 3.7845104543961976e-05, "loss": 0.8207, "step": 2134 }, { "epoch": 0.16622062186754902, "grad_norm": 2.484206376706807, "learning_rate": 3.784285003130121e-05, "loss": 0.9056, "step": 2135 }, { "epoch": 0.16629847695975866, "grad_norm": 2.434380792256099, "learning_rate": 3.7840594407114746e-05, "loss": 0.8532, "step": 2136 }, { "epoch": 0.16637633205196828, "grad_norm": 2.443504199385652, "learning_rate": 3.783833767154312e-05, "loss": 0.9012, "step": 2137 }, { "epoch": 0.1664541871441779, "grad_norm": 2.2395191123171867, "learning_rate": 3.7836079824726895e-05, "loss": 0.8679, "step": 2138 }, { "epoch": 0.1665320422363875, "grad_norm": 2.4001787081725277, "learning_rate": 3.783382086680673e-05, "loss": 0.8131, "step": 2139 }, { "epoch": 0.16660989732859716, "grad_norm": 2.4791721857799733, "learning_rate": 3.783156079792336e-05, "loss": 0.8905, "step": 2140 }, { "epoch": 0.16668775242080677, "grad_norm": 2.2978107546400475, "learning_rate": 3.782929961821757e-05, "loss": 0.8684, "step": 2141 }, { "epoch": 0.1667656075130164, "grad_norm": 2.4732887669242225, "learning_rate": 3.782703732783021e-05, "loss": 0.8527, "step": 2142 }, { "epoch": 0.16684346260522603, "grad_norm": 2.572039601252694, "learning_rate": 3.782477392690222e-05, "loss": 0.9472, "step": 2143 }, { "epoch": 0.16692131769743565, "grad_norm": 2.4659582729831366, "learning_rate": 3.782250941557461e-05, "loss": 0.8865, "step": 2144 }, { "epoch": 0.16699917278964527, "grad_norm": 2.4604165417111648, "learning_rate": 3.782024379398842e-05, "loss": 0.8885, "step": 2145 }, { "epoch": 0.16707702788185488, "grad_norm": 2.373573349476085, "learning_rate": 3.781797706228482e-05, "loss": 0.8181, "step": 2146 }, { "epoch": 0.16715488297406453, "grad_norm": 2.53025684154379, "learning_rate": 3.781570922060499e-05, "loss": 0.9086, "step": 2147 }, { "epoch": 0.16723273806627414, "grad_norm": 2.4826094396467604, "learning_rate": 3.781344026909022e-05, "loss": 0.8885, "step": 2148 }, { "epoch": 0.16731059315848376, "grad_norm": 2.4967003065156894, "learning_rate": 3.7811170207881854e-05, "loss": 0.895, "step": 2149 }, { "epoch": 0.1673884482506934, "grad_norm": 2.430568764534328, "learning_rate": 3.780889903712131e-05, "loss": 0.8616, "step": 2150 }, { "epoch": 0.1673884482506934, "eval_loss": 0.10843771696090698, "eval_runtime": 163.1656, "eval_samples_per_second": 17.651, "eval_steps_per_second": 0.631, "step": 2150 }, { "epoch": 0.16746630334290302, "grad_norm": 2.3077258599698713, "learning_rate": 3.780662675695005e-05, "loss": 0.7893, "step": 2151 }, { "epoch": 0.16754415843511264, "grad_norm": 2.1904763926212247, "learning_rate": 3.780435336750965e-05, "loss": 0.8124, "step": 2152 }, { "epoch": 0.16762201352732228, "grad_norm": 2.4777205666372986, "learning_rate": 3.780207886894172e-05, "loss": 0.9556, "step": 2153 }, { "epoch": 0.1676998686195319, "grad_norm": 2.3593277724297486, "learning_rate": 3.7799803261387956e-05, "loss": 0.8761, "step": 2154 }, { "epoch": 0.16777772371174152, "grad_norm": 2.3604894212297975, "learning_rate": 3.7797526544990115e-05, "loss": 0.831, "step": 2155 }, { "epoch": 0.16785557880395113, "grad_norm": 2.3949145145183723, "learning_rate": 3.779524871989003e-05, "loss": 0.838, "step": 2156 }, { "epoch": 0.16793343389616078, "grad_norm": 2.4653621137456248, "learning_rate": 3.7792969786229586e-05, "loss": 0.8369, "step": 2157 }, { "epoch": 0.1680112889883704, "grad_norm": 2.563361557031901, "learning_rate": 3.779068974415077e-05, "loss": 0.9105, "step": 2158 }, { "epoch": 0.16808914408058, "grad_norm": 2.493484745859675, "learning_rate": 3.7788408593795596e-05, "loss": 0.9455, "step": 2159 }, { "epoch": 0.16816699917278966, "grad_norm": 2.3128252539125436, "learning_rate": 3.778612633530618e-05, "loss": 0.8401, "step": 2160 }, { "epoch": 0.16824485426499927, "grad_norm": 2.244696044391344, "learning_rate": 3.77838429688247e-05, "loss": 0.8344, "step": 2161 }, { "epoch": 0.1683227093572089, "grad_norm": 2.5258888001494246, "learning_rate": 3.77815584944934e-05, "loss": 0.8876, "step": 2162 }, { "epoch": 0.16840056444941853, "grad_norm": 2.4542396400513793, "learning_rate": 3.7779272912454576e-05, "loss": 0.9237, "step": 2163 }, { "epoch": 0.16847841954162815, "grad_norm": 2.3067189358624813, "learning_rate": 3.777698622285062e-05, "loss": 0.8464, "step": 2164 }, { "epoch": 0.16855627463383777, "grad_norm": 2.478178138335767, "learning_rate": 3.7774698425823985e-05, "loss": 0.895, "step": 2165 }, { "epoch": 0.16863412972604738, "grad_norm": 2.4943993700676645, "learning_rate": 3.7772409521517185e-05, "loss": 0.9063, "step": 2166 }, { "epoch": 0.16871198481825703, "grad_norm": 2.3433614466352446, "learning_rate": 3.777011951007281e-05, "loss": 0.8526, "step": 2167 }, { "epoch": 0.16878983991046664, "grad_norm": 2.2840745845313957, "learning_rate": 3.776782839163352e-05, "loss": 0.8773, "step": 2168 }, { "epoch": 0.16886769500267626, "grad_norm": 2.3352963515414698, "learning_rate": 3.776553616634203e-05, "loss": 0.8042, "step": 2169 }, { "epoch": 0.1689455500948859, "grad_norm": 2.407761367427975, "learning_rate": 3.7763242834341154e-05, "loss": 0.8264, "step": 2170 }, { "epoch": 0.16902340518709552, "grad_norm": 2.4417239867755285, "learning_rate": 3.7760948395773737e-05, "loss": 0.8853, "step": 2171 }, { "epoch": 0.16910126027930514, "grad_norm": 2.5736094484443788, "learning_rate": 3.775865285078272e-05, "loss": 0.928, "step": 2172 }, { "epoch": 0.16917911537151475, "grad_norm": 2.441148447044499, "learning_rate": 3.77563561995111e-05, "loss": 0.9004, "step": 2173 }, { "epoch": 0.1692569704637244, "grad_norm": 2.8609328813948487, "learning_rate": 3.775405844210195e-05, "loss": 0.8491, "step": 2174 }, { "epoch": 0.16933482555593402, "grad_norm": 2.336889070412731, "learning_rate": 3.775175957869841e-05, "loss": 0.8049, "step": 2175 }, { "epoch": 0.16941268064814363, "grad_norm": 2.4066542008835077, "learning_rate": 3.77494596094437e-05, "loss": 0.836, "step": 2176 }, { "epoch": 0.16949053574035328, "grad_norm": 2.434881278636431, "learning_rate": 3.774715853448108e-05, "loss": 0.8906, "step": 2177 }, { "epoch": 0.1695683908325629, "grad_norm": 2.4759585068325523, "learning_rate": 3.77448563539539e-05, "loss": 0.8214, "step": 2178 }, { "epoch": 0.1696462459247725, "grad_norm": 2.472362348052906, "learning_rate": 3.7742553068005577e-05, "loss": 0.8813, "step": 2179 }, { "epoch": 0.16972410101698215, "grad_norm": 2.409275118137104, "learning_rate": 3.7740248676779595e-05, "loss": 0.7879, "step": 2180 }, { "epoch": 0.16980195610919177, "grad_norm": 2.40311760352492, "learning_rate": 3.773794318041951e-05, "loss": 0.8467, "step": 2181 }, { "epoch": 0.1698798112014014, "grad_norm": 2.6316781203829946, "learning_rate": 3.773563657906894e-05, "loss": 0.906, "step": 2182 }, { "epoch": 0.169957666293611, "grad_norm": 2.2469357122108113, "learning_rate": 3.7733328872871575e-05, "loss": 0.7904, "step": 2183 }, { "epoch": 0.17003552138582065, "grad_norm": 2.616046346947488, "learning_rate": 3.7731020061971177e-05, "loss": 0.938, "step": 2184 }, { "epoch": 0.17011337647803026, "grad_norm": 2.4652292661080595, "learning_rate": 3.772871014651157e-05, "loss": 0.8886, "step": 2185 }, { "epoch": 0.17019123157023988, "grad_norm": 2.322561167572615, "learning_rate": 3.772639912663665e-05, "loss": 0.8638, "step": 2186 }, { "epoch": 0.17026908666244953, "grad_norm": 2.6805416599147733, "learning_rate": 3.772408700249039e-05, "loss": 0.8242, "step": 2187 }, { "epoch": 0.17034694175465914, "grad_norm": 2.3189110345808186, "learning_rate": 3.7721773774216826e-05, "loss": 0.8334, "step": 2188 }, { "epoch": 0.17042479684686876, "grad_norm": 2.381507526295645, "learning_rate": 3.771945944196005e-05, "loss": 0.8493, "step": 2189 }, { "epoch": 0.1705026519390784, "grad_norm": 2.3598929532476, "learning_rate": 3.771714400586424e-05, "loss": 0.8443, "step": 2190 }, { "epoch": 0.17058050703128802, "grad_norm": 2.5392330276721684, "learning_rate": 3.771482746607363e-05, "loss": 0.9031, "step": 2191 }, { "epoch": 0.17065836212349764, "grad_norm": 2.660809882606931, "learning_rate": 3.771250982273254e-05, "loss": 0.8886, "step": 2192 }, { "epoch": 0.17073621721570725, "grad_norm": 2.3477492971517244, "learning_rate": 3.7710191075985346e-05, "loss": 0.8298, "step": 2193 }, { "epoch": 0.1708140723079169, "grad_norm": 2.63475030555767, "learning_rate": 3.77078712259765e-05, "loss": 0.9214, "step": 2194 }, { "epoch": 0.17089192740012651, "grad_norm": 2.545536582667565, "learning_rate": 3.77055502728505e-05, "loss": 0.9564, "step": 2195 }, { "epoch": 0.17096978249233613, "grad_norm": 2.3106898692145736, "learning_rate": 3.770322821675195e-05, "loss": 0.8055, "step": 2196 }, { "epoch": 0.17104763758454578, "grad_norm": 2.487721429128572, "learning_rate": 3.7700905057825486e-05, "loss": 0.8765, "step": 2197 }, { "epoch": 0.1711254926767554, "grad_norm": 2.3364819953734806, "learning_rate": 3.7698580796215846e-05, "loss": 0.8484, "step": 2198 }, { "epoch": 0.171203347768965, "grad_norm": 2.4525681773207877, "learning_rate": 3.769625543206781e-05, "loss": 0.8002, "step": 2199 }, { "epoch": 0.17128120286117463, "grad_norm": 2.4947795597612545, "learning_rate": 3.7693928965526234e-05, "loss": 0.863, "step": 2200 }, { "epoch": 0.17128120286117463, "eval_loss": 0.10665779560804367, "eval_runtime": 162.5456, "eval_samples_per_second": 17.718, "eval_steps_per_second": 0.634, "step": 2200 }, { "epoch": 0.17135905795338427, "grad_norm": 2.6677058359970283, "learning_rate": 3.7691601396736056e-05, "loss": 0.8378, "step": 2201 }, { "epoch": 0.17143691304559389, "grad_norm": 2.2402293162635107, "learning_rate": 3.7689272725842274e-05, "loss": 0.7582, "step": 2202 }, { "epoch": 0.1715147681378035, "grad_norm": 2.717010011912141, "learning_rate": 3.768694295298995e-05, "loss": 0.8851, "step": 2203 }, { "epoch": 0.17159262323001315, "grad_norm": 2.51216983094273, "learning_rate": 3.76846120783242e-05, "loss": 0.8784, "step": 2204 }, { "epoch": 0.17167047832222276, "grad_norm": 2.4741335065450487, "learning_rate": 3.768228010199026e-05, "loss": 0.8346, "step": 2205 }, { "epoch": 0.17174833341443238, "grad_norm": 2.546528532473841, "learning_rate": 3.767994702413337e-05, "loss": 0.8442, "step": 2206 }, { "epoch": 0.17182618850664202, "grad_norm": 2.385364203092747, "learning_rate": 3.76776128448989e-05, "loss": 0.8203, "step": 2207 }, { "epoch": 0.17190404359885164, "grad_norm": 2.289043054165318, "learning_rate": 3.7675277564432234e-05, "loss": 0.818, "step": 2208 }, { "epoch": 0.17198189869106126, "grad_norm": 2.2162170230598646, "learning_rate": 3.7672941182878854e-05, "loss": 0.7825, "step": 2209 }, { "epoch": 0.17205975378327087, "grad_norm": 2.6834088846091215, "learning_rate": 3.767060370038431e-05, "loss": 0.8313, "step": 2210 }, { "epoch": 0.17213760887548052, "grad_norm": 2.501563155672685, "learning_rate": 3.766826511709421e-05, "loss": 0.7809, "step": 2211 }, { "epoch": 0.17221546396769014, "grad_norm": 2.30707537762903, "learning_rate": 3.766592543315425e-05, "loss": 0.8018, "step": 2212 }, { "epoch": 0.17229331905989975, "grad_norm": 2.4832957401917435, "learning_rate": 3.766358464871017e-05, "loss": 0.8, "step": 2213 }, { "epoch": 0.1723711741521094, "grad_norm": 2.521482564136472, "learning_rate": 3.766124276390778e-05, "loss": 0.8744, "step": 2214 }, { "epoch": 0.172449029244319, "grad_norm": 2.4738263069750617, "learning_rate": 3.7658899778893e-05, "loss": 0.9343, "step": 2215 }, { "epoch": 0.17252688433652863, "grad_norm": 2.471778462557904, "learning_rate": 3.765655569381176e-05, "loss": 0.8573, "step": 2216 }, { "epoch": 0.17260473942873827, "grad_norm": 2.49254909125431, "learning_rate": 3.765421050881009e-05, "loss": 0.9202, "step": 2217 }, { "epoch": 0.1726825945209479, "grad_norm": 2.49461974817835, "learning_rate": 3.7651864224034095e-05, "loss": 0.8438, "step": 2218 }, { "epoch": 0.1727604496131575, "grad_norm": 2.4056534365158706, "learning_rate": 3.764951683962993e-05, "loss": 0.8999, "step": 2219 }, { "epoch": 0.17283830470536712, "grad_norm": 2.5196401671889896, "learning_rate": 3.7647168355743816e-05, "loss": 0.8392, "step": 2220 }, { "epoch": 0.17291615979757677, "grad_norm": 2.6655839214250436, "learning_rate": 3.764481877252207e-05, "loss": 0.8892, "step": 2221 }, { "epoch": 0.17299401488978638, "grad_norm": 2.5760628030136625, "learning_rate": 3.764246809011105e-05, "loss": 0.8526, "step": 2222 }, { "epoch": 0.173071869981996, "grad_norm": 2.715522525154826, "learning_rate": 3.76401163086572e-05, "loss": 0.9303, "step": 2223 }, { "epoch": 0.17314972507420565, "grad_norm": 2.3981079940106733, "learning_rate": 3.7637763428307014e-05, "loss": 0.8076, "step": 2224 }, { "epoch": 0.17322758016641526, "grad_norm": 2.567657836379994, "learning_rate": 3.763540944920707e-05, "loss": 0.8678, "step": 2225 }, { "epoch": 0.17330543525862488, "grad_norm": 2.5791137805699793, "learning_rate": 3.7633054371504014e-05, "loss": 0.871, "step": 2226 }, { "epoch": 0.1733832903508345, "grad_norm": 2.341285217424965, "learning_rate": 3.7630698195344554e-05, "loss": 0.7983, "step": 2227 }, { "epoch": 0.17346114544304414, "grad_norm": 2.544668509093836, "learning_rate": 3.7628340920875467e-05, "loss": 0.8581, "step": 2228 }, { "epoch": 0.17353900053525376, "grad_norm": 2.4353053057021863, "learning_rate": 3.7625982548243606e-05, "loss": 0.8652, "step": 2229 }, { "epoch": 0.17361685562746337, "grad_norm": 2.466911608207861, "learning_rate": 3.7623623077595875e-05, "loss": 0.7953, "step": 2230 }, { "epoch": 0.17369471071967302, "grad_norm": 2.4468551359987165, "learning_rate": 3.7621262509079264e-05, "loss": 0.8247, "step": 2231 }, { "epoch": 0.17377256581188263, "grad_norm": 2.527340210941719, "learning_rate": 3.7618900842840825e-05, "loss": 0.88, "step": 2232 }, { "epoch": 0.17385042090409225, "grad_norm": 2.5930818018502753, "learning_rate": 3.761653807902769e-05, "loss": 0.8666, "step": 2233 }, { "epoch": 0.1739282759963019, "grad_norm": 2.586531392753901, "learning_rate": 3.761417421778703e-05, "loss": 0.8617, "step": 2234 }, { "epoch": 0.1740061310885115, "grad_norm": 2.4081941757283114, "learning_rate": 3.761180925926611e-05, "loss": 0.7477, "step": 2235 }, { "epoch": 0.17408398618072113, "grad_norm": 2.2563553110200374, "learning_rate": 3.760944320361226e-05, "loss": 0.7713, "step": 2236 }, { "epoch": 0.17416184127293075, "grad_norm": 2.3726233060444653, "learning_rate": 3.7607076050972866e-05, "loss": 0.8327, "step": 2237 }, { "epoch": 0.1742396963651404, "grad_norm": 2.5461867677209264, "learning_rate": 3.7604707801495394e-05, "loss": 0.8237, "step": 2238 }, { "epoch": 0.17431755145735, "grad_norm": 2.1949421476640993, "learning_rate": 3.760233845532738e-05, "loss": 0.7593, "step": 2239 }, { "epoch": 0.17439540654955962, "grad_norm": 2.383940359818703, "learning_rate": 3.759996801261641e-05, "loss": 0.837, "step": 2240 }, { "epoch": 0.17447326164176927, "grad_norm": 2.3162579169074577, "learning_rate": 3.759759647351017e-05, "loss": 0.8051, "step": 2241 }, { "epoch": 0.17455111673397888, "grad_norm": 2.4501098235558074, "learning_rate": 3.759522383815638e-05, "loss": 0.8429, "step": 2242 }, { "epoch": 0.1746289718261885, "grad_norm": 2.4890786631476116, "learning_rate": 3.759285010670285e-05, "loss": 0.8642, "step": 2243 }, { "epoch": 0.17470682691839814, "grad_norm": 2.3133324510822404, "learning_rate": 3.7590475279297456e-05, "loss": 0.7899, "step": 2244 }, { "epoch": 0.17478468201060776, "grad_norm": 2.5069225751974984, "learning_rate": 3.7588099356088135e-05, "loss": 0.8731, "step": 2245 }, { "epoch": 0.17486253710281738, "grad_norm": 2.2385038779840025, "learning_rate": 3.7585722337222887e-05, "loss": 0.7605, "step": 2246 }, { "epoch": 0.174940392195027, "grad_norm": 2.650433931652383, "learning_rate": 3.7583344222849805e-05, "loss": 0.8902, "step": 2247 }, { "epoch": 0.17501824728723664, "grad_norm": 2.408247134751366, "learning_rate": 3.7580965013117026e-05, "loss": 0.8361, "step": 2248 }, { "epoch": 0.17509610237944626, "grad_norm": 2.58222792951655, "learning_rate": 3.757858470817276e-05, "loss": 0.8422, "step": 2249 }, { "epoch": 0.17517395747165587, "grad_norm": 2.562491632477033, "learning_rate": 3.75762033081653e-05, "loss": 0.8643, "step": 2250 }, { "epoch": 0.17517395747165587, "eval_loss": 0.10293591767549515, "eval_runtime": 162.429, "eval_samples_per_second": 17.731, "eval_steps_per_second": 0.634, "step": 2250 }, { "epoch": 0.17525181256386552, "grad_norm": 2.298876748929633, "learning_rate": 3.7573820813242984e-05, "loss": 0.8245, "step": 2251 }, { "epoch": 0.17532966765607513, "grad_norm": 2.5572254715197, "learning_rate": 3.757143722355424e-05, "loss": 0.8366, "step": 2252 }, { "epoch": 0.17540752274828475, "grad_norm": 2.4576636264517178, "learning_rate": 3.756905253924754e-05, "loss": 0.8341, "step": 2253 }, { "epoch": 0.17548537784049437, "grad_norm": 2.513044349130344, "learning_rate": 3.756666676047146e-05, "loss": 0.8812, "step": 2254 }, { "epoch": 0.175563232932704, "grad_norm": 2.5025754627190118, "learning_rate": 3.7564279887374604e-05, "loss": 0.8806, "step": 2255 }, { "epoch": 0.17564108802491363, "grad_norm": 2.2520425436225553, "learning_rate": 3.756189192010568e-05, "loss": 0.7869, "step": 2256 }, { "epoch": 0.17571894311712324, "grad_norm": 2.458278879748657, "learning_rate": 3.755950285881343e-05, "loss": 0.8102, "step": 2257 }, { "epoch": 0.1757967982093329, "grad_norm": 2.450709252423602, "learning_rate": 3.7557112703646694e-05, "loss": 0.8558, "step": 2258 }, { "epoch": 0.1758746533015425, "grad_norm": 2.2450892190000746, "learning_rate": 3.7554721454754356e-05, "loss": 0.7954, "step": 2259 }, { "epoch": 0.17595250839375212, "grad_norm": 2.491558711241801, "learning_rate": 3.755232911228539e-05, "loss": 0.8017, "step": 2260 }, { "epoch": 0.17603036348596177, "grad_norm": 2.4850466538993765, "learning_rate": 3.7549935676388816e-05, "loss": 0.8143, "step": 2261 }, { "epoch": 0.17610821857817138, "grad_norm": 2.3134980633059286, "learning_rate": 3.754754114721375e-05, "loss": 0.7991, "step": 2262 }, { "epoch": 0.176186073670381, "grad_norm": 2.3723302204676004, "learning_rate": 3.754514552490935e-05, "loss": 0.7122, "step": 2263 }, { "epoch": 0.17626392876259062, "grad_norm": 2.4135874616396236, "learning_rate": 3.754274880962485e-05, "loss": 0.7627, "step": 2264 }, { "epoch": 0.17634178385480026, "grad_norm": 2.2714539785160053, "learning_rate": 3.7540351001509565e-05, "loss": 0.7945, "step": 2265 }, { "epoch": 0.17641963894700988, "grad_norm": 2.407836426626062, "learning_rate": 3.7537952100712855e-05, "loss": 0.8242, "step": 2266 }, { "epoch": 0.1764974940392195, "grad_norm": 2.513347852993329, "learning_rate": 3.7535552107384156e-05, "loss": 0.8674, "step": 2267 }, { "epoch": 0.17657534913142914, "grad_norm": 2.34293199485879, "learning_rate": 3.753315102167299e-05, "loss": 0.8451, "step": 2268 }, { "epoch": 0.17665320422363875, "grad_norm": 2.3947363264972172, "learning_rate": 3.7530748843728944e-05, "loss": 0.7842, "step": 2269 }, { "epoch": 0.17673105931584837, "grad_norm": 2.3309292343004797, "learning_rate": 3.752834557370163e-05, "loss": 0.7919, "step": 2270 }, { "epoch": 0.17680891440805802, "grad_norm": 2.475360057115621, "learning_rate": 3.752594121174078e-05, "loss": 0.8351, "step": 2271 }, { "epoch": 0.17688676950026763, "grad_norm": 2.413620630165558, "learning_rate": 3.7523535757996174e-05, "loss": 0.8316, "step": 2272 }, { "epoch": 0.17696462459247725, "grad_norm": 2.4706802009171467, "learning_rate": 3.752112921261766e-05, "loss": 0.8129, "step": 2273 }, { "epoch": 0.17704247968468687, "grad_norm": 2.264860414816145, "learning_rate": 3.751872157575514e-05, "loss": 0.8363, "step": 2274 }, { "epoch": 0.1771203347768965, "grad_norm": 2.549592388746955, "learning_rate": 3.751631284755863e-05, "loss": 0.8751, "step": 2275 }, { "epoch": 0.17719818986910613, "grad_norm": 2.5776549418198673, "learning_rate": 3.751390302817815e-05, "loss": 0.8457, "step": 2276 }, { "epoch": 0.17727604496131574, "grad_norm": 2.287100450978305, "learning_rate": 3.7511492117763844e-05, "loss": 0.8173, "step": 2277 }, { "epoch": 0.1773539000535254, "grad_norm": 2.4495812401686954, "learning_rate": 3.750908011646588e-05, "loss": 0.8081, "step": 2278 }, { "epoch": 0.177431755145735, "grad_norm": 2.2859669998653307, "learning_rate": 3.750666702443453e-05, "loss": 0.7949, "step": 2279 }, { "epoch": 0.17750961023794462, "grad_norm": 2.5742263277374113, "learning_rate": 3.750425284182011e-05, "loss": 0.9302, "step": 2280 }, { "epoch": 0.17758746533015424, "grad_norm": 2.30221814782415, "learning_rate": 3.7501837568773024e-05, "loss": 0.81, "step": 2281 }, { "epoch": 0.17766532042236388, "grad_norm": 2.4496199787145634, "learning_rate": 3.7499421205443714e-05, "loss": 0.8958, "step": 2282 }, { "epoch": 0.1777431755145735, "grad_norm": 2.291173444884397, "learning_rate": 3.7497003751982723e-05, "loss": 0.7504, "step": 2283 }, { "epoch": 0.17782103060678311, "grad_norm": 2.5896339556037393, "learning_rate": 3.749458520854064e-05, "loss": 0.7841, "step": 2284 }, { "epoch": 0.17789888569899276, "grad_norm": 2.294585467320771, "learning_rate": 3.749216557526813e-05, "loss": 0.7596, "step": 2285 }, { "epoch": 0.17797674079120238, "grad_norm": 2.4769741077509804, "learning_rate": 3.748974485231592e-05, "loss": 0.8295, "step": 2286 }, { "epoch": 0.178054595883412, "grad_norm": 2.3474460303744333, "learning_rate": 3.748732303983482e-05, "loss": 0.7615, "step": 2287 }, { "epoch": 0.17813245097562164, "grad_norm": 2.4827365338740446, "learning_rate": 3.748490013797569e-05, "loss": 0.8258, "step": 2288 }, { "epoch": 0.17821030606783125, "grad_norm": 2.34579115680622, "learning_rate": 3.748247614688947e-05, "loss": 0.8064, "step": 2289 }, { "epoch": 0.17828816116004087, "grad_norm": 2.306185626987456, "learning_rate": 3.748005106672716e-05, "loss": 0.806, "step": 2290 }, { "epoch": 0.1783660162522505, "grad_norm": 2.4432182144601127, "learning_rate": 3.747762489763983e-05, "loss": 0.8369, "step": 2291 }, { "epoch": 0.17844387134446013, "grad_norm": 2.5173677466652844, "learning_rate": 3.7475197639778624e-05, "loss": 0.8532, "step": 2292 }, { "epoch": 0.17852172643666975, "grad_norm": 2.340061807190158, "learning_rate": 3.7472769293294746e-05, "loss": 0.7588, "step": 2293 }, { "epoch": 0.17859958152887936, "grad_norm": 2.485540759752307, "learning_rate": 3.747033985833947e-05, "loss": 0.8672, "step": 2294 }, { "epoch": 0.178677436621089, "grad_norm": 2.3946858123876655, "learning_rate": 3.7467909335064135e-05, "loss": 0.7885, "step": 2295 }, { "epoch": 0.17875529171329863, "grad_norm": 2.1520311753654355, "learning_rate": 3.746547772362016e-05, "loss": 0.6921, "step": 2296 }, { "epoch": 0.17883314680550824, "grad_norm": 2.427107804936546, "learning_rate": 3.746304502415901e-05, "loss": 0.7668, "step": 2297 }, { "epoch": 0.1789110018977179, "grad_norm": 2.463243861038909, "learning_rate": 3.746061123683225e-05, "loss": 0.8228, "step": 2298 }, { "epoch": 0.1789888569899275, "grad_norm": 2.5918971849849584, "learning_rate": 3.7458176361791476e-05, "loss": 0.8352, "step": 2299 }, { "epoch": 0.17906671208213712, "grad_norm": 2.41429190183338, "learning_rate": 3.745574039918838e-05, "loss": 0.8157, "step": 2300 }, { "epoch": 0.17906671208213712, "eval_loss": 0.10166560858488083, "eval_runtime": 162.5417, "eval_samples_per_second": 17.719, "eval_steps_per_second": 0.634, "step": 2300 }, { "epoch": 0.17914456717434674, "grad_norm": 2.169245553453764, "learning_rate": 3.745330334917469e-05, "loss": 0.7764, "step": 2301 }, { "epoch": 0.17922242226655638, "grad_norm": 2.388799203006203, "learning_rate": 3.745086521190226e-05, "loss": 0.809, "step": 2302 }, { "epoch": 0.179300277358766, "grad_norm": 2.254705683666534, "learning_rate": 3.7448425987522944e-05, "loss": 0.8215, "step": 2303 }, { "epoch": 0.1793781324509756, "grad_norm": 2.269665196126171, "learning_rate": 3.7445985676188705e-05, "loss": 0.7901, "step": 2304 }, { "epoch": 0.17945598754318526, "grad_norm": 2.410731617177937, "learning_rate": 3.744354427805156e-05, "loss": 0.7837, "step": 2305 }, { "epoch": 0.17953384263539487, "grad_norm": 2.3048765301711835, "learning_rate": 3.74411017932636e-05, "loss": 0.8124, "step": 2306 }, { "epoch": 0.1796116977276045, "grad_norm": 2.4673384811125767, "learning_rate": 3.7438658221976974e-05, "loss": 0.8612, "step": 2307 }, { "epoch": 0.1796895528198141, "grad_norm": 2.295169267997425, "learning_rate": 3.7436213564343915e-05, "loss": 0.7591, "step": 2308 }, { "epoch": 0.17976740791202375, "grad_norm": 2.246174042155016, "learning_rate": 3.7433767820516707e-05, "loss": 0.7874, "step": 2309 }, { "epoch": 0.17984526300423337, "grad_norm": 2.233449172320841, "learning_rate": 3.743132099064771e-05, "loss": 0.7244, "step": 2310 }, { "epoch": 0.17992311809644299, "grad_norm": 2.505833520284556, "learning_rate": 3.742887307488935e-05, "loss": 0.8605, "step": 2311 }, { "epoch": 0.18000097318865263, "grad_norm": 2.4896214630829827, "learning_rate": 3.742642407339412e-05, "loss": 0.8402, "step": 2312 }, { "epoch": 0.18007882828086225, "grad_norm": 2.1833302063457727, "learning_rate": 3.7423973986314585e-05, "loss": 0.7327, "step": 2313 }, { "epoch": 0.18015668337307186, "grad_norm": 2.315255935693474, "learning_rate": 3.7421522813803365e-05, "loss": 0.8113, "step": 2314 }, { "epoch": 0.1802345384652815, "grad_norm": 2.4046576503601, "learning_rate": 3.741907055601316e-05, "loss": 0.7814, "step": 2315 }, { "epoch": 0.18031239355749112, "grad_norm": 2.351971097426295, "learning_rate": 3.741661721309675e-05, "loss": 0.7302, "step": 2316 }, { "epoch": 0.18039024864970074, "grad_norm": 2.3287945300679644, "learning_rate": 3.741416278520694e-05, "loss": 0.77, "step": 2317 }, { "epoch": 0.18046810374191036, "grad_norm": 2.508380415848313, "learning_rate": 3.7411707272496644e-05, "loss": 0.7843, "step": 2318 }, { "epoch": 0.18054595883412, "grad_norm": 2.425188944980297, "learning_rate": 3.7409250675118826e-05, "loss": 0.7873, "step": 2319 }, { "epoch": 0.18062381392632962, "grad_norm": 2.3549633649874835, "learning_rate": 3.7406792993226525e-05, "loss": 0.8599, "step": 2320 }, { "epoch": 0.18070166901853923, "grad_norm": 2.4185337306019052, "learning_rate": 3.740433422697284e-05, "loss": 0.7429, "step": 2321 }, { "epoch": 0.18077952411074888, "grad_norm": 2.5210819513898066, "learning_rate": 3.740187437651094e-05, "loss": 0.7936, "step": 2322 }, { "epoch": 0.1808573792029585, "grad_norm": 2.298599391628089, "learning_rate": 3.739941344199406e-05, "loss": 0.7959, "step": 2323 }, { "epoch": 0.1809352342951681, "grad_norm": 2.4938733917366647, "learning_rate": 3.739695142357551e-05, "loss": 0.8305, "step": 2324 }, { "epoch": 0.18101308938737776, "grad_norm": 2.4112903369660974, "learning_rate": 3.739448832140866e-05, "loss": 0.7909, "step": 2325 }, { "epoch": 0.18109094447958737, "grad_norm": 2.331319451379721, "learning_rate": 3.739202413564695e-05, "loss": 0.8061, "step": 2326 }, { "epoch": 0.181168799571797, "grad_norm": 2.5158250316900093, "learning_rate": 3.738955886644388e-05, "loss": 0.7664, "step": 2327 }, { "epoch": 0.1812466546640066, "grad_norm": 2.463903539208326, "learning_rate": 3.7387092513953034e-05, "loss": 0.7827, "step": 2328 }, { "epoch": 0.18132450975621625, "grad_norm": 2.386318924557663, "learning_rate": 3.738462507832805e-05, "loss": 0.8, "step": 2329 }, { "epoch": 0.18140236484842587, "grad_norm": 2.307422253568008, "learning_rate": 3.738215655972264e-05, "loss": 0.7656, "step": 2330 }, { "epoch": 0.18148021994063548, "grad_norm": 2.3642952229359966, "learning_rate": 3.737968695829058e-05, "loss": 0.8095, "step": 2331 }, { "epoch": 0.18155807503284513, "grad_norm": 2.396348915714041, "learning_rate": 3.737721627418571e-05, "loss": 0.7944, "step": 2332 }, { "epoch": 0.18163593012505475, "grad_norm": 2.4956106815963905, "learning_rate": 3.737474450756195e-05, "loss": 0.8126, "step": 2333 }, { "epoch": 0.18171378521726436, "grad_norm": 2.474130962246907, "learning_rate": 3.737227165857328e-05, "loss": 0.7925, "step": 2334 }, { "epoch": 0.18179164030947398, "grad_norm": 2.488100500972795, "learning_rate": 3.7369797727373735e-05, "loss": 0.8064, "step": 2335 }, { "epoch": 0.18186949540168362, "grad_norm": 2.705723989591828, "learning_rate": 3.736732271411744e-05, "loss": 0.8495, "step": 2336 }, { "epoch": 0.18194735049389324, "grad_norm": 2.5144796522299533, "learning_rate": 3.736484661895858e-05, "loss": 0.8068, "step": 2337 }, { "epoch": 0.18202520558610286, "grad_norm": 2.247030115420733, "learning_rate": 3.7362369442051386e-05, "loss": 0.7749, "step": 2338 }, { "epoch": 0.1821030606783125, "grad_norm": 2.474651787640784, "learning_rate": 3.735989118355019e-05, "loss": 0.7911, "step": 2339 }, { "epoch": 0.18218091577052212, "grad_norm": 2.3211396339220376, "learning_rate": 3.7357411843609375e-05, "loss": 0.7157, "step": 2340 }, { "epoch": 0.18225877086273173, "grad_norm": 2.2732300542785087, "learning_rate": 3.735493142238339e-05, "loss": 0.7418, "step": 2341 }, { "epoch": 0.18233662595494138, "grad_norm": 2.466011395522677, "learning_rate": 3.735244992002675e-05, "loss": 0.8009, "step": 2342 }, { "epoch": 0.182414481047151, "grad_norm": 2.695221949078532, "learning_rate": 3.734996733669404e-05, "loss": 0.8646, "step": 2343 }, { "epoch": 0.1824923361393606, "grad_norm": 2.5746468599407333, "learning_rate": 3.734748367253992e-05, "loss": 0.825, "step": 2344 }, { "epoch": 0.18257019123157023, "grad_norm": 2.4980724677661796, "learning_rate": 3.734499892771911e-05, "loss": 0.7662, "step": 2345 }, { "epoch": 0.18264804632377987, "grad_norm": 2.426143847867281, "learning_rate": 3.734251310238639e-05, "loss": 0.7461, "step": 2346 }, { "epoch": 0.1827259014159895, "grad_norm": 2.257556670652366, "learning_rate": 3.7340026196696625e-05, "loss": 0.7297, "step": 2347 }, { "epoch": 0.1828037565081991, "grad_norm": 2.3592815434655, "learning_rate": 3.733753821080473e-05, "loss": 0.7762, "step": 2348 }, { "epoch": 0.18288161160040875, "grad_norm": 2.551711750544829, "learning_rate": 3.73350491448657e-05, "loss": 0.7944, "step": 2349 }, { "epoch": 0.18295946669261837, "grad_norm": 2.4474783287572834, "learning_rate": 3.7332558999034596e-05, "loss": 0.7911, "step": 2350 }, { "epoch": 0.18295946669261837, "eval_loss": 0.0968175008893013, "eval_runtime": 162.8621, "eval_samples_per_second": 17.684, "eval_steps_per_second": 0.632, "step": 2350 }, { "epoch": 0.18303732178482798, "grad_norm": 2.495621979530813, "learning_rate": 3.733006777346653e-05, "loss": 0.7474, "step": 2351 }, { "epoch": 0.18311517687703763, "grad_norm": 2.4118332524089743, "learning_rate": 3.732757546831671e-05, "loss": 0.7305, "step": 2352 }, { "epoch": 0.18319303196924724, "grad_norm": 2.3074418021790954, "learning_rate": 3.7325082083740364e-05, "loss": 0.7572, "step": 2353 }, { "epoch": 0.18327088706145686, "grad_norm": 2.306872057879337, "learning_rate": 3.732258761989286e-05, "loss": 0.7324, "step": 2354 }, { "epoch": 0.18334874215366648, "grad_norm": 2.442495641993301, "learning_rate": 3.732009207692956e-05, "loss": 0.8219, "step": 2355 }, { "epoch": 0.18342659724587612, "grad_norm": 2.3327245067272213, "learning_rate": 3.7317595455005936e-05, "loss": 0.7507, "step": 2356 }, { "epoch": 0.18350445233808574, "grad_norm": 2.4320283992041647, "learning_rate": 3.731509775427752e-05, "loss": 0.7379, "step": 2357 }, { "epoch": 0.18358230743029536, "grad_norm": 2.3639317515934812, "learning_rate": 3.7312598974899896e-05, "loss": 0.7816, "step": 2358 }, { "epoch": 0.183660162522505, "grad_norm": 2.5665980315753183, "learning_rate": 3.731009911702874e-05, "loss": 0.7808, "step": 2359 }, { "epoch": 0.18373801761471462, "grad_norm": 2.4949514345935326, "learning_rate": 3.7307598180819765e-05, "loss": 0.7825, "step": 2360 }, { "epoch": 0.18381587270692423, "grad_norm": 2.425713223384404, "learning_rate": 3.730509616642878e-05, "loss": 0.7871, "step": 2361 }, { "epoch": 0.18389372779913385, "grad_norm": 2.193678629398024, "learning_rate": 3.7302593074011643e-05, "loss": 0.7108, "step": 2362 }, { "epoch": 0.1839715828913435, "grad_norm": 2.3956890097945673, "learning_rate": 3.730008890372429e-05, "loss": 0.7761, "step": 2363 }, { "epoch": 0.1840494379835531, "grad_norm": 2.4925150234736506, "learning_rate": 3.7297583655722716e-05, "loss": 0.7882, "step": 2364 }, { "epoch": 0.18412729307576273, "grad_norm": 2.3434706403310064, "learning_rate": 3.7295077330162986e-05, "loss": 0.7861, "step": 2365 }, { "epoch": 0.18420514816797237, "grad_norm": 2.5027424763972967, "learning_rate": 3.729256992720124e-05, "loss": 0.8235, "step": 2366 }, { "epoch": 0.184283003260182, "grad_norm": 2.2290747236866646, "learning_rate": 3.7290061446993656e-05, "loss": 0.7236, "step": 2367 }, { "epoch": 0.1843608583523916, "grad_norm": 2.3507353661467256, "learning_rate": 3.728755188969652e-05, "loss": 0.7734, "step": 2368 }, { "epoch": 0.18443871344460125, "grad_norm": 2.2873796282535004, "learning_rate": 3.728504125546616e-05, "loss": 0.76, "step": 2369 }, { "epoch": 0.18451656853681087, "grad_norm": 2.4787054592785287, "learning_rate": 3.728252954445897e-05, "loss": 0.8222, "step": 2370 }, { "epoch": 0.18459442362902048, "grad_norm": 2.3754050270789757, "learning_rate": 3.728001675683143e-05, "loss": 0.8118, "step": 2371 }, { "epoch": 0.1846722787212301, "grad_norm": 2.2295369867045625, "learning_rate": 3.7277502892740074e-05, "loss": 0.7628, "step": 2372 }, { "epoch": 0.18475013381343974, "grad_norm": 2.2880469555882392, "learning_rate": 3.72749879523415e-05, "loss": 0.7764, "step": 2373 }, { "epoch": 0.18482798890564936, "grad_norm": 2.2918518287476095, "learning_rate": 3.7272471935792364e-05, "loss": 0.7883, "step": 2374 }, { "epoch": 0.18490584399785898, "grad_norm": 2.3283635313683475, "learning_rate": 3.726995484324942e-05, "loss": 0.8052, "step": 2375 }, { "epoch": 0.18498369909006862, "grad_norm": 2.455927751475157, "learning_rate": 3.726743667486947e-05, "loss": 0.7733, "step": 2376 }, { "epoch": 0.18506155418227824, "grad_norm": 2.24995304993423, "learning_rate": 3.7264917430809376e-05, "loss": 0.7687, "step": 2377 }, { "epoch": 0.18513940927448785, "grad_norm": 2.2804953461287694, "learning_rate": 3.726239711122608e-05, "loss": 0.7345, "step": 2378 }, { "epoch": 0.18521726436669747, "grad_norm": 2.318454654897747, "learning_rate": 3.7259875716276583e-05, "loss": 0.7603, "step": 2379 }, { "epoch": 0.18529511945890711, "grad_norm": 2.4381233568626612, "learning_rate": 3.725735324611796e-05, "loss": 0.7613, "step": 2380 }, { "epoch": 0.18537297455111673, "grad_norm": 2.3567267865762878, "learning_rate": 3.725482970090734e-05, "loss": 0.7748, "step": 2381 }, { "epoch": 0.18545082964332635, "grad_norm": 2.294846522332465, "learning_rate": 3.725230508080194e-05, "loss": 0.7913, "step": 2382 }, { "epoch": 0.185528684735536, "grad_norm": 2.2489973657506135, "learning_rate": 3.724977938595902e-05, "loss": 0.7247, "step": 2383 }, { "epoch": 0.1856065398277456, "grad_norm": 2.312112303684173, "learning_rate": 3.724725261653593e-05, "loss": 0.7523, "step": 2384 }, { "epoch": 0.18568439491995523, "grad_norm": 2.2411889961988543, "learning_rate": 3.724472477269007e-05, "loss": 0.7041, "step": 2385 }, { "epoch": 0.18576225001216487, "grad_norm": 2.3428720145512223, "learning_rate": 3.724219585457892e-05, "loss": 0.7482, "step": 2386 }, { "epoch": 0.1858401051043745, "grad_norm": 2.411602350788961, "learning_rate": 3.723966586236001e-05, "loss": 0.8029, "step": 2387 }, { "epoch": 0.1859179601965841, "grad_norm": 2.2831533670659083, "learning_rate": 3.723713479619094e-05, "loss": 0.7754, "step": 2388 }, { "epoch": 0.18599581528879372, "grad_norm": 2.484178150854712, "learning_rate": 3.723460265622941e-05, "loss": 0.8925, "step": 2389 }, { "epoch": 0.18607367038100336, "grad_norm": 2.4042297728067092, "learning_rate": 3.723206944263314e-05, "loss": 0.7284, "step": 2390 }, { "epoch": 0.18615152547321298, "grad_norm": 2.3935212650652495, "learning_rate": 3.7229535155559934e-05, "loss": 0.8426, "step": 2391 }, { "epoch": 0.1862293805654226, "grad_norm": 2.2883126200766033, "learning_rate": 3.722699979516768e-05, "loss": 0.7414, "step": 2392 }, { "epoch": 0.18630723565763224, "grad_norm": 2.2797668823568946, "learning_rate": 3.722446336161431e-05, "loss": 0.7722, "step": 2393 }, { "epoch": 0.18638509074984186, "grad_norm": 2.3314049385876827, "learning_rate": 3.722192585505784e-05, "loss": 0.7829, "step": 2394 }, { "epoch": 0.18646294584205148, "grad_norm": 2.354256286711737, "learning_rate": 3.721938727565634e-05, "loss": 0.7499, "step": 2395 }, { "epoch": 0.18654080093426112, "grad_norm": 2.2227808553837285, "learning_rate": 3.7216847623567944e-05, "loss": 0.7114, "step": 2396 }, { "epoch": 0.18661865602647074, "grad_norm": 2.2922674762577753, "learning_rate": 3.7214306898950866e-05, "loss": 0.7603, "step": 2397 }, { "epoch": 0.18669651111868035, "grad_norm": 2.2679688222152152, "learning_rate": 3.721176510196339e-05, "loss": 0.7216, "step": 2398 }, { "epoch": 0.18677436621088997, "grad_norm": 2.801234390261991, "learning_rate": 3.720922223276385e-05, "loss": 0.8607, "step": 2399 }, { "epoch": 0.1868522213030996, "grad_norm": 2.3627519678333613, "learning_rate": 3.7206678291510655e-05, "loss": 0.7642, "step": 2400 }, { "epoch": 0.1868522213030996, "eval_loss": 0.09447361528873444, "eval_runtime": 162.6211, "eval_samples_per_second": 17.71, "eval_steps_per_second": 0.633, "step": 2400 }, { "epoch": 0.18693007639530923, "grad_norm": 2.3194988392395324, "learning_rate": 3.7204133278362276e-05, "loss": 0.7226, "step": 2401 }, { "epoch": 0.18700793148751885, "grad_norm": 2.37193684977049, "learning_rate": 3.7201587193477264e-05, "loss": 0.7075, "step": 2402 }, { "epoch": 0.1870857865797285, "grad_norm": 2.282669052342881, "learning_rate": 3.7199040037014225e-05, "loss": 0.719, "step": 2403 }, { "epoch": 0.1871636416719381, "grad_norm": 2.3743407769670726, "learning_rate": 3.719649180913183e-05, "loss": 0.7623, "step": 2404 }, { "epoch": 0.18724149676414772, "grad_norm": 2.25632656242283, "learning_rate": 3.7193942509988825e-05, "loss": 0.7506, "step": 2405 }, { "epoch": 0.18731935185635734, "grad_norm": 2.6097598021441226, "learning_rate": 3.719139213974403e-05, "loss": 0.7989, "step": 2406 }, { "epoch": 0.18739720694856699, "grad_norm": 2.64360175780725, "learning_rate": 3.71888406985563e-05, "loss": 0.7817, "step": 2407 }, { "epoch": 0.1874750620407766, "grad_norm": 2.443983342017249, "learning_rate": 3.7186288186584594e-05, "loss": 0.8286, "step": 2408 }, { "epoch": 0.18755291713298622, "grad_norm": 2.277665896527969, "learning_rate": 3.718373460398792e-05, "loss": 0.7603, "step": 2409 }, { "epoch": 0.18763077222519586, "grad_norm": 2.457823005112283, "learning_rate": 3.7181179950925345e-05, "loss": 0.8115, "step": 2410 }, { "epoch": 0.18770862731740548, "grad_norm": 2.5485908229658585, "learning_rate": 3.717862422755603e-05, "loss": 0.7931, "step": 2411 }, { "epoch": 0.1877864824096151, "grad_norm": 2.300027686200299, "learning_rate": 3.717606743403916e-05, "loss": 0.7194, "step": 2412 }, { "epoch": 0.18786433750182474, "grad_norm": 2.544525337866324, "learning_rate": 3.717350957053402e-05, "loss": 0.8, "step": 2413 }, { "epoch": 0.18794219259403436, "grad_norm": 2.533418676323926, "learning_rate": 3.7170950637199966e-05, "loss": 0.8747, "step": 2414 }, { "epoch": 0.18802004768624397, "grad_norm": 2.43944774524497, "learning_rate": 3.71683906341964e-05, "loss": 0.8033, "step": 2415 }, { "epoch": 0.1880979027784536, "grad_norm": 2.3208123413951967, "learning_rate": 3.716582956168279e-05, "loss": 0.7793, "step": 2416 }, { "epoch": 0.18817575787066324, "grad_norm": 2.531887846592721, "learning_rate": 3.716326741981868e-05, "loss": 0.77, "step": 2417 }, { "epoch": 0.18825361296287285, "grad_norm": 2.2911158778381346, "learning_rate": 3.716070420876369e-05, "loss": 0.7304, "step": 2418 }, { "epoch": 0.18833146805508247, "grad_norm": 2.232550167861876, "learning_rate": 3.715813992867749e-05, "loss": 0.7317, "step": 2419 }, { "epoch": 0.1884093231472921, "grad_norm": 2.4456749195659038, "learning_rate": 3.715557457971983e-05, "loss": 0.7506, "step": 2420 }, { "epoch": 0.18848717823950173, "grad_norm": 2.393836644187456, "learning_rate": 3.7153008162050505e-05, "loss": 0.7476, "step": 2421 }, { "epoch": 0.18856503333171135, "grad_norm": 2.177500204434335, "learning_rate": 3.715044067582939e-05, "loss": 0.6933, "step": 2422 }, { "epoch": 0.188642888423921, "grad_norm": 2.2738525243502488, "learning_rate": 3.7147872121216446e-05, "loss": 0.7777, "step": 2423 }, { "epoch": 0.1887207435161306, "grad_norm": 2.2977421476195716, "learning_rate": 3.714530249837167e-05, "loss": 0.7297, "step": 2424 }, { "epoch": 0.18879859860834022, "grad_norm": 2.184136814832011, "learning_rate": 3.7142731807455134e-05, "loss": 0.6824, "step": 2425 }, { "epoch": 0.18887645370054984, "grad_norm": 2.3157834447681362, "learning_rate": 3.714016004862699e-05, "loss": 0.7815, "step": 2426 }, { "epoch": 0.18895430879275948, "grad_norm": 2.155728411740568, "learning_rate": 3.713758722204744e-05, "loss": 0.7491, "step": 2427 }, { "epoch": 0.1890321638849691, "grad_norm": 2.3090209128306154, "learning_rate": 3.7135013327876755e-05, "loss": 0.7832, "step": 2428 }, { "epoch": 0.18911001897717872, "grad_norm": 2.2050271759515323, "learning_rate": 3.713243836627528e-05, "loss": 0.6735, "step": 2429 }, { "epoch": 0.18918787406938836, "grad_norm": 2.1742306791458446, "learning_rate": 3.7129862337403426e-05, "loss": 0.719, "step": 2430 }, { "epoch": 0.18926572916159798, "grad_norm": 2.157407896732134, "learning_rate": 3.7127285241421675e-05, "loss": 0.7148, "step": 2431 }, { "epoch": 0.1893435842538076, "grad_norm": 2.4841061204310546, "learning_rate": 3.712470707849055e-05, "loss": 0.7666, "step": 2432 }, { "epoch": 0.1894214393460172, "grad_norm": 2.485281494038557, "learning_rate": 3.712212784877067e-05, "loss": 0.7798, "step": 2433 }, { "epoch": 0.18949929443822686, "grad_norm": 2.382702880077513, "learning_rate": 3.71195475524227e-05, "loss": 0.7144, "step": 2434 }, { "epoch": 0.18957714953043647, "grad_norm": 2.420444981377451, "learning_rate": 3.7116966189607394e-05, "loss": 0.7472, "step": 2435 }, { "epoch": 0.1896550046226461, "grad_norm": 2.4160310184133373, "learning_rate": 3.711438376048555e-05, "loss": 0.8544, "step": 2436 }, { "epoch": 0.18973285971485573, "grad_norm": 2.1618634269449233, "learning_rate": 3.711180026521804e-05, "loss": 0.7335, "step": 2437 }, { "epoch": 0.18981071480706535, "grad_norm": 2.3904248475535823, "learning_rate": 3.710921570396581e-05, "loss": 0.7838, "step": 2438 }, { "epoch": 0.18988856989927497, "grad_norm": 2.1326097847456755, "learning_rate": 3.710663007688986e-05, "loss": 0.7188, "step": 2439 }, { "epoch": 0.1899664249914846, "grad_norm": 2.251017352586258, "learning_rate": 3.710404338415126e-05, "loss": 0.7581, "step": 2440 }, { "epoch": 0.19004428008369423, "grad_norm": 2.3025033380216886, "learning_rate": 3.710145562591116e-05, "loss": 0.763, "step": 2441 }, { "epoch": 0.19012213517590384, "grad_norm": 2.455436637555831, "learning_rate": 3.7098866802330755e-05, "loss": 0.7829, "step": 2442 }, { "epoch": 0.19019999026811346, "grad_norm": 2.079764215575296, "learning_rate": 3.709627691357132e-05, "loss": 0.6795, "step": 2443 }, { "epoch": 0.1902778453603231, "grad_norm": 2.4111828784845395, "learning_rate": 3.7093685959794194e-05, "loss": 0.7817, "step": 2444 }, { "epoch": 0.19035570045253272, "grad_norm": 2.2789788999252707, "learning_rate": 3.709109394116077e-05, "loss": 0.7409, "step": 2445 }, { "epoch": 0.19043355554474234, "grad_norm": 2.16036273357868, "learning_rate": 3.7088500857832545e-05, "loss": 0.6985, "step": 2446 }, { "epoch": 0.19051141063695198, "grad_norm": 2.292867981268107, "learning_rate": 3.7085906709971036e-05, "loss": 0.7533, "step": 2447 }, { "epoch": 0.1905892657291616, "grad_norm": 2.2701895328334167, "learning_rate": 3.7083311497737845e-05, "loss": 0.725, "step": 2448 }, { "epoch": 0.19066712082137122, "grad_norm": 2.2864425728756355, "learning_rate": 3.708071522129465e-05, "loss": 0.7077, "step": 2449 }, { "epoch": 0.19074497591358086, "grad_norm": 2.2932128492869253, "learning_rate": 3.707811788080318e-05, "loss": 0.7599, "step": 2450 }, { "epoch": 0.19074497591358086, "eval_loss": 0.09202577918767929, "eval_runtime": 162.8768, "eval_samples_per_second": 17.682, "eval_steps_per_second": 0.632, "step": 2450 }, { "epoch": 0.19082283100579048, "grad_norm": 2.370072079474763, "learning_rate": 3.707551947642524e-05, "loss": 0.7141, "step": 2451 }, { "epoch": 0.1909006860980001, "grad_norm": 2.1870495695180225, "learning_rate": 3.7072920008322705e-05, "loss": 0.714, "step": 2452 }, { "epoch": 0.1909785411902097, "grad_norm": 2.2733414949347814, "learning_rate": 3.707031947665749e-05, "loss": 0.6887, "step": 2453 }, { "epoch": 0.19105639628241936, "grad_norm": 2.153972761878918, "learning_rate": 3.706771788159162e-05, "loss": 0.7473, "step": 2454 }, { "epoch": 0.19113425137462897, "grad_norm": 2.163688549719391, "learning_rate": 3.7065115223287154e-05, "loss": 0.7168, "step": 2455 }, { "epoch": 0.1912121064668386, "grad_norm": 2.3015772038706506, "learning_rate": 3.706251150190622e-05, "loss": 0.766, "step": 2456 }, { "epoch": 0.19128996155904823, "grad_norm": 2.2248111368880172, "learning_rate": 3.705990671761101e-05, "loss": 0.7275, "step": 2457 }, { "epoch": 0.19136781665125785, "grad_norm": 2.402545897717582, "learning_rate": 3.705730087056381e-05, "loss": 0.7813, "step": 2458 }, { "epoch": 0.19144567174346747, "grad_norm": 2.3409728540246344, "learning_rate": 3.705469396092694e-05, "loss": 0.7416, "step": 2459 }, { "epoch": 0.19152352683567708, "grad_norm": 2.267396095641167, "learning_rate": 3.70520859888628e-05, "loss": 0.7085, "step": 2460 }, { "epoch": 0.19160138192788673, "grad_norm": 2.5112710690067064, "learning_rate": 3.7049476954533855e-05, "loss": 0.7644, "step": 2461 }, { "epoch": 0.19167923702009634, "grad_norm": 2.1828396195830013, "learning_rate": 3.704686685810264e-05, "loss": 0.7122, "step": 2462 }, { "epoch": 0.19175709211230596, "grad_norm": 2.5514329053364313, "learning_rate": 3.7044255699731747e-05, "loss": 0.8238, "step": 2463 }, { "epoch": 0.1918349472045156, "grad_norm": 2.2085410124074984, "learning_rate": 3.7041643479583826e-05, "loss": 0.6948, "step": 2464 }, { "epoch": 0.19191280229672522, "grad_norm": 2.1712135796136534, "learning_rate": 3.7039030197821635e-05, "loss": 0.6984, "step": 2465 }, { "epoch": 0.19199065738893484, "grad_norm": 2.2681025552654415, "learning_rate": 3.703641585460794e-05, "loss": 0.7221, "step": 2466 }, { "epoch": 0.19206851248114448, "grad_norm": 2.2560553628036053, "learning_rate": 3.703380045010562e-05, "loss": 0.6812, "step": 2467 }, { "epoch": 0.1921463675733541, "grad_norm": 2.3328539763185843, "learning_rate": 3.70311839844776e-05, "loss": 0.743, "step": 2468 }, { "epoch": 0.19222422266556372, "grad_norm": 2.33801931168022, "learning_rate": 3.7028566457886875e-05, "loss": 0.7114, "step": 2469 }, { "epoch": 0.19230207775777333, "grad_norm": 2.5153409017164616, "learning_rate": 3.7025947870496494e-05, "loss": 0.7801, "step": 2470 }, { "epoch": 0.19237993284998298, "grad_norm": 2.4248958373724347, "learning_rate": 3.70233282224696e-05, "loss": 0.7259, "step": 2471 }, { "epoch": 0.1924577879421926, "grad_norm": 2.475521927500963, "learning_rate": 3.702070751396936e-05, "loss": 0.7187, "step": 2472 }, { "epoch": 0.1925356430344022, "grad_norm": 2.270512777949964, "learning_rate": 3.701808574515906e-05, "loss": 0.7291, "step": 2473 }, { "epoch": 0.19261349812661185, "grad_norm": 2.4556054010539934, "learning_rate": 3.7015462916202e-05, "loss": 0.7893, "step": 2474 }, { "epoch": 0.19269135321882147, "grad_norm": 2.3736573584729106, "learning_rate": 3.7012839027261585e-05, "loss": 0.7355, "step": 2475 }, { "epoch": 0.1927692083110311, "grad_norm": 2.1993796551145435, "learning_rate": 3.7010214078501264e-05, "loss": 0.6497, "step": 2476 }, { "epoch": 0.19284706340324073, "grad_norm": 2.3899445391909877, "learning_rate": 3.700758807008455e-05, "loss": 0.8036, "step": 2477 }, { "epoch": 0.19292491849545035, "grad_norm": 2.182545828938881, "learning_rate": 3.700496100217506e-05, "loss": 0.6831, "step": 2478 }, { "epoch": 0.19300277358765996, "grad_norm": 2.4597560347615923, "learning_rate": 3.7002332874936426e-05, "loss": 0.7445, "step": 2479 }, { "epoch": 0.19308062867986958, "grad_norm": 2.1404821205346014, "learning_rate": 3.699970368853237e-05, "loss": 0.6674, "step": 2480 }, { "epoch": 0.19315848377207923, "grad_norm": 2.436209790160414, "learning_rate": 3.6997073443126673e-05, "loss": 0.7594, "step": 2481 }, { "epoch": 0.19323633886428884, "grad_norm": 2.4251649320205866, "learning_rate": 3.69944421388832e-05, "loss": 0.7841, "step": 2482 }, { "epoch": 0.19331419395649846, "grad_norm": 2.4376174267093123, "learning_rate": 3.6991809775965865e-05, "loss": 0.7595, "step": 2483 }, { "epoch": 0.1933920490487081, "grad_norm": 2.379252491924595, "learning_rate": 3.6989176354538646e-05, "loss": 0.7528, "step": 2484 }, { "epoch": 0.19346990414091772, "grad_norm": 2.254210420281008, "learning_rate": 3.69865418747656e-05, "loss": 0.6846, "step": 2485 }, { "epoch": 0.19354775923312734, "grad_norm": 2.11714030966829, "learning_rate": 3.698390633681084e-05, "loss": 0.7028, "step": 2486 }, { "epoch": 0.19362561432533695, "grad_norm": 2.2735672394871185, "learning_rate": 3.698126974083854e-05, "loss": 0.7419, "step": 2487 }, { "epoch": 0.1937034694175466, "grad_norm": 2.155338594791153, "learning_rate": 3.6978632087012957e-05, "loss": 0.6375, "step": 2488 }, { "epoch": 0.19378132450975621, "grad_norm": 2.24214043398443, "learning_rate": 3.69759933754984e-05, "loss": 0.758, "step": 2489 }, { "epoch": 0.19385917960196583, "grad_norm": 2.475896790481823, "learning_rate": 3.697335360645926e-05, "loss": 0.7396, "step": 2490 }, { "epoch": 0.19393703469417548, "grad_norm": 2.073234165514887, "learning_rate": 3.697071278005996e-05, "loss": 0.6637, "step": 2491 }, { "epoch": 0.1940148897863851, "grad_norm": 2.281393079744592, "learning_rate": 3.6968070896465025e-05, "loss": 0.7651, "step": 2492 }, { "epoch": 0.1940927448785947, "grad_norm": 2.343297437415946, "learning_rate": 3.696542795583903e-05, "loss": 0.721, "step": 2493 }, { "epoch": 0.19417059997080435, "grad_norm": 2.1690015539601526, "learning_rate": 3.696278395834662e-05, "loss": 0.6498, "step": 2494 }, { "epoch": 0.19424845506301397, "grad_norm": 2.4107705264270463, "learning_rate": 3.69601389041525e-05, "loss": 0.7071, "step": 2495 }, { "epoch": 0.19432631015522359, "grad_norm": 2.39185287020114, "learning_rate": 3.695749279342145e-05, "loss": 0.7645, "step": 2496 }, { "epoch": 0.1944041652474332, "grad_norm": 2.2976904232877855, "learning_rate": 3.695484562631831e-05, "loss": 0.7249, "step": 2497 }, { "epoch": 0.19448202033964285, "grad_norm": 2.1716166714454173, "learning_rate": 3.695219740300797e-05, "loss": 0.6505, "step": 2498 }, { "epoch": 0.19455987543185246, "grad_norm": 2.2598406815839747, "learning_rate": 3.694954812365542e-05, "loss": 0.6972, "step": 2499 }, { "epoch": 0.19463773052406208, "grad_norm": 2.383177320329559, "learning_rate": 3.6946897788425696e-05, "loss": 0.7497, "step": 2500 }, { "epoch": 0.19463773052406208, "eval_loss": 0.08918396383523941, "eval_runtime": 162.1244, "eval_samples_per_second": 17.764, "eval_steps_per_second": 0.635, "step": 2500 }, { "epoch": 0.19471558561627172, "grad_norm": 2.2594495562600474, "learning_rate": 3.694424639748389e-05, "loss": 0.7135, "step": 2501 }, { "epoch": 0.19479344070848134, "grad_norm": 2.2847894947375216, "learning_rate": 3.694159395099518e-05, "loss": 0.7413, "step": 2502 }, { "epoch": 0.19487129580069096, "grad_norm": 2.3088853190476484, "learning_rate": 3.69389404491248e-05, "loss": 0.724, "step": 2503 }, { "epoch": 0.1949491508929006, "grad_norm": 2.4024997261422265, "learning_rate": 3.693628589203806e-05, "loss": 0.7487, "step": 2504 }, { "epoch": 0.19502700598511022, "grad_norm": 2.307967205833228, "learning_rate": 3.69336302799003e-05, "loss": 0.7277, "step": 2505 }, { "epoch": 0.19510486107731984, "grad_norm": 2.27225216435528, "learning_rate": 3.693097361287698e-05, "loss": 0.6259, "step": 2506 }, { "epoch": 0.19518271616952945, "grad_norm": 2.2443197689241874, "learning_rate": 3.692831589113359e-05, "loss": 0.6818, "step": 2507 }, { "epoch": 0.1952605712617391, "grad_norm": 2.3520622156015936, "learning_rate": 3.692565711483568e-05, "loss": 0.763, "step": 2508 }, { "epoch": 0.1953384263539487, "grad_norm": 2.644321159626011, "learning_rate": 3.692299728414889e-05, "loss": 0.7455, "step": 2509 }, { "epoch": 0.19541628144615833, "grad_norm": 2.269302370802242, "learning_rate": 3.6920336399238915e-05, "loss": 0.6967, "step": 2510 }, { "epoch": 0.19549413653836797, "grad_norm": 2.471204855648829, "learning_rate": 3.691767446027151e-05, "loss": 0.7592, "step": 2511 }, { "epoch": 0.1955719916305776, "grad_norm": 2.3898094396145946, "learning_rate": 3.691501146741252e-05, "loss": 0.6988, "step": 2512 }, { "epoch": 0.1956498467227872, "grad_norm": 2.312830721659989, "learning_rate": 3.691234742082781e-05, "loss": 0.7155, "step": 2513 }, { "epoch": 0.19572770181499682, "grad_norm": 2.2403459334710427, "learning_rate": 3.690968232068336e-05, "loss": 0.718, "step": 2514 }, { "epoch": 0.19580555690720647, "grad_norm": 2.444258441676263, "learning_rate": 3.690701616714518e-05, "loss": 0.687, "step": 2515 }, { "epoch": 0.19588341199941608, "grad_norm": 2.053745731837685, "learning_rate": 3.690434896037936e-05, "loss": 0.7179, "step": 2516 }, { "epoch": 0.1959612670916257, "grad_norm": 2.1298086334147524, "learning_rate": 3.690168070055206e-05, "loss": 0.663, "step": 2517 }, { "epoch": 0.19603912218383535, "grad_norm": 2.3612928592271403, "learning_rate": 3.68990113878295e-05, "loss": 0.7222, "step": 2518 }, { "epoch": 0.19611697727604496, "grad_norm": 2.451944788373121, "learning_rate": 3.689634102237796e-05, "loss": 0.7217, "step": 2519 }, { "epoch": 0.19619483236825458, "grad_norm": 2.322521439821023, "learning_rate": 3.68936696043638e-05, "loss": 0.738, "step": 2520 }, { "epoch": 0.19627268746046422, "grad_norm": 2.25805562382499, "learning_rate": 3.689099713395343e-05, "loss": 0.6546, "step": 2521 }, { "epoch": 0.19635054255267384, "grad_norm": 2.2242478941405985, "learning_rate": 3.688832361131332e-05, "loss": 0.6607, "step": 2522 }, { "epoch": 0.19642839764488346, "grad_norm": 2.267834458736767, "learning_rate": 3.6885649036610043e-05, "loss": 0.7335, "step": 2523 }, { "epoch": 0.19650625273709307, "grad_norm": 2.329219298367133, "learning_rate": 3.68829734100102e-05, "loss": 0.7493, "step": 2524 }, { "epoch": 0.19658410782930272, "grad_norm": 2.2083211384638815, "learning_rate": 3.6880296731680466e-05, "loss": 0.7187, "step": 2525 }, { "epoch": 0.19666196292151233, "grad_norm": 2.250194382401991, "learning_rate": 3.687761900178759e-05, "loss": 0.7169, "step": 2526 }, { "epoch": 0.19673981801372195, "grad_norm": 2.2723229278392054, "learning_rate": 3.687494022049839e-05, "loss": 0.7519, "step": 2527 }, { "epoch": 0.1968176731059316, "grad_norm": 2.321771146232444, "learning_rate": 3.687226038797973e-05, "loss": 0.6558, "step": 2528 }, { "epoch": 0.1968955281981412, "grad_norm": 2.2281559206027826, "learning_rate": 3.6869579504398545e-05, "loss": 0.6975, "step": 2529 }, { "epoch": 0.19697338329035083, "grad_norm": 2.104100472847719, "learning_rate": 3.686689756992186e-05, "loss": 0.6472, "step": 2530 }, { "epoch": 0.19705123838256047, "grad_norm": 2.2528179945560325, "learning_rate": 3.6864214584716734e-05, "loss": 0.6645, "step": 2531 }, { "epoch": 0.1971290934747701, "grad_norm": 2.118105715388837, "learning_rate": 3.686153054895031e-05, "loss": 0.6585, "step": 2532 }, { "epoch": 0.1972069485669797, "grad_norm": 2.3250194500988117, "learning_rate": 3.6858845462789797e-05, "loss": 0.7273, "step": 2533 }, { "epoch": 0.19728480365918932, "grad_norm": 2.247939026159367, "learning_rate": 3.685615932640244e-05, "loss": 0.6802, "step": 2534 }, { "epoch": 0.19736265875139897, "grad_norm": 2.2291569461866585, "learning_rate": 3.68534721399556e-05, "loss": 0.7362, "step": 2535 }, { "epoch": 0.19744051384360858, "grad_norm": 2.142955828480689, "learning_rate": 3.6850783903616655e-05, "loss": 0.7208, "step": 2536 }, { "epoch": 0.1975183689358182, "grad_norm": 2.0484443603509526, "learning_rate": 3.684809461755309e-05, "loss": 0.6522, "step": 2537 }, { "epoch": 0.19759622402802784, "grad_norm": 2.27460517832965, "learning_rate": 3.684540428193241e-05, "loss": 0.7049, "step": 2538 }, { "epoch": 0.19767407912023746, "grad_norm": 2.2333435929281245, "learning_rate": 3.684271289692223e-05, "loss": 0.7158, "step": 2539 }, { "epoch": 0.19775193421244708, "grad_norm": 2.304335608100827, "learning_rate": 3.6840020462690203e-05, "loss": 0.7073, "step": 2540 }, { "epoch": 0.1978297893046567, "grad_norm": 2.0847313677653205, "learning_rate": 3.683732697940406e-05, "loss": 0.7079, "step": 2541 }, { "epoch": 0.19790764439686634, "grad_norm": 2.0799709376390476, "learning_rate": 3.683463244723159e-05, "loss": 0.6672, "step": 2542 }, { "epoch": 0.19798549948907596, "grad_norm": 2.3253626692766116, "learning_rate": 3.683193686634064e-05, "loss": 0.7446, "step": 2543 }, { "epoch": 0.19806335458128557, "grad_norm": 2.267267322884709, "learning_rate": 3.682924023689914e-05, "loss": 0.6923, "step": 2544 }, { "epoch": 0.19814120967349522, "grad_norm": 2.169470147418849, "learning_rate": 3.682654255907509e-05, "loss": 0.6925, "step": 2545 }, { "epoch": 0.19821906476570483, "grad_norm": 2.387419478004806, "learning_rate": 3.682384383303652e-05, "loss": 0.7412, "step": 2546 }, { "epoch": 0.19829691985791445, "grad_norm": 2.176117752748696, "learning_rate": 3.682114405895156e-05, "loss": 0.6989, "step": 2547 }, { "epoch": 0.1983747749501241, "grad_norm": 2.56479677724233, "learning_rate": 3.68184432369884e-05, "loss": 0.7371, "step": 2548 }, { "epoch": 0.1984526300423337, "grad_norm": 2.238130681136998, "learning_rate": 3.681574136731526e-05, "loss": 0.705, "step": 2549 }, { "epoch": 0.19853048513454333, "grad_norm": 2.308441128033792, "learning_rate": 3.681303845010049e-05, "loss": 0.6858, "step": 2550 }, { "epoch": 0.19853048513454333, "eval_loss": 0.08725877106189728, "eval_runtime": 162.2175, "eval_samples_per_second": 17.754, "eval_steps_per_second": 0.635, "step": 2550 }, { "epoch": 0.19860834022675294, "grad_norm": 2.168412284448642, "learning_rate": 3.681033448551245e-05, "loss": 0.7557, "step": 2551 }, { "epoch": 0.1986861953189626, "grad_norm": 2.4405821452666006, "learning_rate": 3.6807629473719584e-05, "loss": 0.6994, "step": 2552 }, { "epoch": 0.1987640504111722, "grad_norm": 2.2056053635125985, "learning_rate": 3.680492341489041e-05, "loss": 0.6512, "step": 2553 }, { "epoch": 0.19884190550338182, "grad_norm": 2.310054802939469, "learning_rate": 3.6802216309193496e-05, "loss": 0.732, "step": 2554 }, { "epoch": 0.19891976059559147, "grad_norm": 2.239072618022695, "learning_rate": 3.679950815679748e-05, "loss": 0.7056, "step": 2555 }, { "epoch": 0.19899761568780108, "grad_norm": 2.4170267648481927, "learning_rate": 3.679679895787107e-05, "loss": 0.7318, "step": 2556 }, { "epoch": 0.1990754707800107, "grad_norm": 2.263373026369559, "learning_rate": 3.6794088712583036e-05, "loss": 0.7046, "step": 2557 }, { "epoch": 0.19915332587222034, "grad_norm": 2.2675317426623978, "learning_rate": 3.679137742110222e-05, "loss": 0.7382, "step": 2558 }, { "epoch": 0.19923118096442996, "grad_norm": 2.18993059152864, "learning_rate": 3.678866508359751e-05, "loss": 0.7127, "step": 2559 }, { "epoch": 0.19930903605663958, "grad_norm": 2.306199802641627, "learning_rate": 3.6785951700237884e-05, "loss": 0.7112, "step": 2560 }, { "epoch": 0.1993868911488492, "grad_norm": 2.267768663660777, "learning_rate": 3.678323727119237e-05, "loss": 0.6993, "step": 2561 }, { "epoch": 0.19946474624105884, "grad_norm": 2.372884175629649, "learning_rate": 3.678052179663006e-05, "loss": 0.7121, "step": 2562 }, { "epoch": 0.19954260133326845, "grad_norm": 2.3080528051971965, "learning_rate": 3.677780527672011e-05, "loss": 0.6964, "step": 2563 }, { "epoch": 0.19962045642547807, "grad_norm": 2.240264006794626, "learning_rate": 3.677508771163177e-05, "loss": 0.7232, "step": 2564 }, { "epoch": 0.19969831151768772, "grad_norm": 2.329559188158234, "learning_rate": 3.67723691015343e-05, "loss": 0.7055, "step": 2565 }, { "epoch": 0.19977616660989733, "grad_norm": 2.174565382176542, "learning_rate": 3.676964944659708e-05, "loss": 0.6755, "step": 2566 }, { "epoch": 0.19985402170210695, "grad_norm": 2.1569187810595265, "learning_rate": 3.6766928746989525e-05, "loss": 0.6391, "step": 2567 }, { "epoch": 0.19993187679431657, "grad_norm": 2.57572955249789, "learning_rate": 3.6764207002881113e-05, "loss": 0.7284, "step": 2568 }, { "epoch": 0.2000097318865262, "grad_norm": 2.385514523475645, "learning_rate": 3.6761484214441413e-05, "loss": 0.6857, "step": 2569 }, { "epoch": 0.20008758697873583, "grad_norm": 2.305768090223911, "learning_rate": 3.675876038184003e-05, "loss": 0.674, "step": 2570 }, { "epoch": 0.20016544207094544, "grad_norm": 2.369381374353853, "learning_rate": 3.675603550524664e-05, "loss": 0.714, "step": 2571 }, { "epoch": 0.2002432971631551, "grad_norm": 2.184145979046535, "learning_rate": 3.6753309584831e-05, "loss": 0.6904, "step": 2572 }, { "epoch": 0.2003211522553647, "grad_norm": 2.226326438602375, "learning_rate": 3.675058262076293e-05, "loss": 0.6551, "step": 2573 }, { "epoch": 0.20039900734757432, "grad_norm": 2.169622811069855, "learning_rate": 3.6747854613212296e-05, "loss": 0.6814, "step": 2574 }, { "epoch": 0.20047686243978396, "grad_norm": 2.186397701358365, "learning_rate": 3.674512556234903e-05, "loss": 0.6748, "step": 2575 }, { "epoch": 0.20055471753199358, "grad_norm": 2.1413596072827303, "learning_rate": 3.674239546834316e-05, "loss": 0.6618, "step": 2576 }, { "epoch": 0.2006325726242032, "grad_norm": 2.1608543302843395, "learning_rate": 3.673966433136475e-05, "loss": 0.6804, "step": 2577 }, { "epoch": 0.20071042771641281, "grad_norm": 2.2815055390381866, "learning_rate": 3.673693215158393e-05, "loss": 0.7046, "step": 2578 }, { "epoch": 0.20078828280862246, "grad_norm": 2.300150348564277, "learning_rate": 3.673419892917091e-05, "loss": 0.6983, "step": 2579 }, { "epoch": 0.20086613790083208, "grad_norm": 2.3185382274991424, "learning_rate": 3.673146466429595e-05, "loss": 0.727, "step": 2580 }, { "epoch": 0.2009439929930417, "grad_norm": 2.2329593440002435, "learning_rate": 3.672872935712939e-05, "loss": 0.6588, "step": 2581 }, { "epoch": 0.20102184808525134, "grad_norm": 2.1656828234974044, "learning_rate": 3.672599300784162e-05, "loss": 0.6236, "step": 2582 }, { "epoch": 0.20109970317746095, "grad_norm": 2.4039275360317194, "learning_rate": 3.6723255616603114e-05, "loss": 0.6762, "step": 2583 }, { "epoch": 0.20117755826967057, "grad_norm": 2.302771271777721, "learning_rate": 3.6720517183584385e-05, "loss": 0.7129, "step": 2584 }, { "epoch": 0.20125541336188021, "grad_norm": 2.3373698420718108, "learning_rate": 3.6717777708956026e-05, "loss": 0.6651, "step": 2585 }, { "epoch": 0.20133326845408983, "grad_norm": 2.450032531187847, "learning_rate": 3.67150371928887e-05, "loss": 0.6536, "step": 2586 }, { "epoch": 0.20141112354629945, "grad_norm": 2.2934884275180805, "learning_rate": 3.671229563555312e-05, "loss": 0.6794, "step": 2587 }, { "epoch": 0.20148897863850906, "grad_norm": 2.180512236490829, "learning_rate": 3.6709553037120084e-05, "loss": 0.675, "step": 2588 }, { "epoch": 0.2015668337307187, "grad_norm": 2.1647801965175857, "learning_rate": 3.670680939776042e-05, "loss": 0.7013, "step": 2589 }, { "epoch": 0.20164468882292833, "grad_norm": 2.18495319548682, "learning_rate": 3.6704064717645075e-05, "loss": 0.6408, "step": 2590 }, { "epoch": 0.20172254391513794, "grad_norm": 2.163599688322347, "learning_rate": 3.670131899694501e-05, "loss": 0.6739, "step": 2591 }, { "epoch": 0.2018003990073476, "grad_norm": 2.1483429800171208, "learning_rate": 3.669857223583127e-05, "loss": 0.6293, "step": 2592 }, { "epoch": 0.2018782540995572, "grad_norm": 2.2781280072443986, "learning_rate": 3.6695824434474973e-05, "loss": 0.6954, "step": 2593 }, { "epoch": 0.20195610919176682, "grad_norm": 2.1438560708479093, "learning_rate": 3.6693075593047287e-05, "loss": 0.6448, "step": 2594 }, { "epoch": 0.20203396428397644, "grad_norm": 2.079276060596638, "learning_rate": 3.669032571171946e-05, "loss": 0.6199, "step": 2595 }, { "epoch": 0.20211181937618608, "grad_norm": 2.244068465214079, "learning_rate": 3.6687574790662796e-05, "loss": 0.6687, "step": 2596 }, { "epoch": 0.2021896744683957, "grad_norm": 2.215165403437808, "learning_rate": 3.668482283004866e-05, "loss": 0.6689, "step": 2597 }, { "epoch": 0.2022675295606053, "grad_norm": 2.2219161085717265, "learning_rate": 3.668206983004848e-05, "loss": 0.6624, "step": 2598 }, { "epoch": 0.20234538465281496, "grad_norm": 2.222810556723596, "learning_rate": 3.667931579083377e-05, "loss": 0.6473, "step": 2599 }, { "epoch": 0.20242323974502457, "grad_norm": 2.1166605297364818, "learning_rate": 3.667656071257608e-05, "loss": 0.665, "step": 2600 }, { "epoch": 0.20242323974502457, "eval_loss": 0.08488881587982178, "eval_runtime": 162.3388, "eval_samples_per_second": 17.741, "eval_steps_per_second": 0.634, "step": 2600 }, { "epoch": 0.2025010948372342, "grad_norm": 2.2405329272248915, "learning_rate": 3.667380459544705e-05, "loss": 0.6512, "step": 2601 }, { "epoch": 0.20257894992944384, "grad_norm": 2.354909696681756, "learning_rate": 3.6671047439618364e-05, "loss": 0.6488, "step": 2602 }, { "epoch": 0.20265680502165345, "grad_norm": 2.3679854361998127, "learning_rate": 3.666828924526179e-05, "loss": 0.7186, "step": 2603 }, { "epoch": 0.20273466011386307, "grad_norm": 2.1065607664805004, "learning_rate": 3.666553001254913e-05, "loss": 0.6891, "step": 2604 }, { "epoch": 0.20281251520607269, "grad_norm": 2.2849551625874724, "learning_rate": 3.66627697416523e-05, "loss": 0.6913, "step": 2605 }, { "epoch": 0.20289037029828233, "grad_norm": 2.1761740290294234, "learning_rate": 3.666000843274323e-05, "loss": 0.665, "step": 2606 }, { "epoch": 0.20296822539049195, "grad_norm": 2.386832854108763, "learning_rate": 3.665724608599394e-05, "loss": 0.6268, "step": 2607 }, { "epoch": 0.20304608048270156, "grad_norm": 2.2698498188290324, "learning_rate": 3.665448270157652e-05, "loss": 0.6834, "step": 2608 }, { "epoch": 0.2031239355749112, "grad_norm": 2.1044893367166924, "learning_rate": 3.6651718279663116e-05, "loss": 0.6996, "step": 2609 }, { "epoch": 0.20320179066712082, "grad_norm": 2.3555290930873554, "learning_rate": 3.6648952820425924e-05, "loss": 0.6463, "step": 2610 }, { "epoch": 0.20327964575933044, "grad_norm": 2.1471949901403153, "learning_rate": 3.6646186324037235e-05, "loss": 0.5981, "step": 2611 }, { "epoch": 0.20335750085154006, "grad_norm": 2.342635154271115, "learning_rate": 3.6643418790669374e-05, "loss": 0.6778, "step": 2612 }, { "epoch": 0.2034353559437497, "grad_norm": 2.1854436419920993, "learning_rate": 3.664065022049476e-05, "loss": 0.6748, "step": 2613 }, { "epoch": 0.20351321103595932, "grad_norm": 2.155749827949858, "learning_rate": 3.663788061368585e-05, "loss": 0.6268, "step": 2614 }, { "epoch": 0.20359106612816893, "grad_norm": 2.4141531631610116, "learning_rate": 3.6635109970415184e-05, "loss": 0.7065, "step": 2615 }, { "epoch": 0.20366892122037858, "grad_norm": 2.11581779162378, "learning_rate": 3.663233829085536e-05, "loss": 0.6578, "step": 2616 }, { "epoch": 0.2037467763125882, "grad_norm": 1.9932190256904307, "learning_rate": 3.662956557517904e-05, "loss": 0.591, "step": 2617 }, { "epoch": 0.2038246314047978, "grad_norm": 2.5122630134755553, "learning_rate": 3.662679182355895e-05, "loss": 0.72, "step": 2618 }, { "epoch": 0.20390248649700746, "grad_norm": 2.2837045277984953, "learning_rate": 3.662401703616788e-05, "loss": 0.6704, "step": 2619 }, { "epoch": 0.20398034158921707, "grad_norm": 2.1253876860307646, "learning_rate": 3.662124121317869e-05, "loss": 0.6618, "step": 2620 }, { "epoch": 0.2040581966814267, "grad_norm": 2.341056351173188, "learning_rate": 3.6618464354764303e-05, "loss": 0.6593, "step": 2621 }, { "epoch": 0.2041360517736363, "grad_norm": 2.4390352832599818, "learning_rate": 3.66156864610977e-05, "loss": 0.6787, "step": 2622 }, { "epoch": 0.20421390686584595, "grad_norm": 2.3896199441524146, "learning_rate": 3.661290753235193e-05, "loss": 0.722, "step": 2623 }, { "epoch": 0.20429176195805557, "grad_norm": 2.3871863607477284, "learning_rate": 3.661012756870011e-05, "loss": 0.6712, "step": 2624 }, { "epoch": 0.20436961705026518, "grad_norm": 2.392908506933896, "learning_rate": 3.6607346570315405e-05, "loss": 0.6803, "step": 2625 }, { "epoch": 0.20444747214247483, "grad_norm": 2.322569378325248, "learning_rate": 3.660456453737108e-05, "loss": 0.6619, "step": 2626 }, { "epoch": 0.20452532723468445, "grad_norm": 2.2847306397412215, "learning_rate": 3.660178147004043e-05, "loss": 0.6916, "step": 2627 }, { "epoch": 0.20460318232689406, "grad_norm": 2.2737526314286223, "learning_rate": 3.659899736849683e-05, "loss": 0.6948, "step": 2628 }, { "epoch": 0.2046810374191037, "grad_norm": 2.2537547455706464, "learning_rate": 3.659621223291372e-05, "loss": 0.6913, "step": 2629 }, { "epoch": 0.20475889251131332, "grad_norm": 2.4265268320707154, "learning_rate": 3.659342606346459e-05, "loss": 0.756, "step": 2630 }, { "epoch": 0.20483674760352294, "grad_norm": 2.2642837746920903, "learning_rate": 3.6590638860323015e-05, "loss": 0.6364, "step": 2631 }, { "epoch": 0.20491460269573256, "grad_norm": 2.3399534438753364, "learning_rate": 3.6587850623662626e-05, "loss": 0.691, "step": 2632 }, { "epoch": 0.2049924577879422, "grad_norm": 2.1298860865335003, "learning_rate": 3.6585061353657116e-05, "loss": 0.6505, "step": 2633 }, { "epoch": 0.20507031288015182, "grad_norm": 2.3076773440438165, "learning_rate": 3.6582271050480234e-05, "loss": 0.608, "step": 2634 }, { "epoch": 0.20514816797236143, "grad_norm": 2.5366694431964585, "learning_rate": 3.6579479714305805e-05, "loss": 0.7119, "step": 2635 }, { "epoch": 0.20522602306457108, "grad_norm": 2.2411924932754217, "learning_rate": 3.657668734530773e-05, "loss": 0.671, "step": 2636 }, { "epoch": 0.2053038781567807, "grad_norm": 2.40479390919453, "learning_rate": 3.6573893943659945e-05, "loss": 0.7206, "step": 2637 }, { "epoch": 0.2053817332489903, "grad_norm": 2.282925160339309, "learning_rate": 3.657109950953647e-05, "loss": 0.7083, "step": 2638 }, { "epoch": 0.20545958834119993, "grad_norm": 2.1843550311088067, "learning_rate": 3.6568304043111386e-05, "loss": 0.6765, "step": 2639 }, { "epoch": 0.20553744343340957, "grad_norm": 2.352058261403471, "learning_rate": 3.656550754455884e-05, "loss": 0.7098, "step": 2640 }, { "epoch": 0.2056152985256192, "grad_norm": 2.3089909915975806, "learning_rate": 3.656271001405304e-05, "loss": 0.6509, "step": 2641 }, { "epoch": 0.2056931536178288, "grad_norm": 2.1199184421011004, "learning_rate": 3.655991145176826e-05, "loss": 0.6358, "step": 2642 }, { "epoch": 0.20577100871003845, "grad_norm": 2.3665699268971645, "learning_rate": 3.655711185787883e-05, "loss": 0.6948, "step": 2643 }, { "epoch": 0.20584886380224807, "grad_norm": 2.4016428492613295, "learning_rate": 3.6554311232559164e-05, "loss": 0.6483, "step": 2644 }, { "epoch": 0.20592671889445768, "grad_norm": 2.366899739294207, "learning_rate": 3.655150957598371e-05, "loss": 0.6628, "step": 2645 }, { "epoch": 0.20600457398666733, "grad_norm": 2.164458856652298, "learning_rate": 3.654870688832701e-05, "loss": 0.6569, "step": 2646 }, { "epoch": 0.20608242907887694, "grad_norm": 2.121467716657185, "learning_rate": 3.654590316976366e-05, "loss": 0.6477, "step": 2647 }, { "epoch": 0.20616028417108656, "grad_norm": 2.2243390704777166, "learning_rate": 3.654309842046832e-05, "loss": 0.6186, "step": 2648 }, { "epoch": 0.20623813926329618, "grad_norm": 2.2957549779909794, "learning_rate": 3.6540292640615705e-05, "loss": 0.6592, "step": 2649 }, { "epoch": 0.20631599435550582, "grad_norm": 2.3433648470701867, "learning_rate": 3.6537485830380604e-05, "loss": 0.7213, "step": 2650 }, { "epoch": 0.20631599435550582, "eval_loss": 0.08168687671422958, "eval_runtime": 162.7303, "eval_samples_per_second": 17.698, "eval_steps_per_second": 0.633, "step": 2650 }, { "epoch": 0.20639384944771544, "grad_norm": 2.1818477862384236, "learning_rate": 3.653467798993786e-05, "loss": 0.604, "step": 2651 }, { "epoch": 0.20647170453992506, "grad_norm": 2.2501853178722597, "learning_rate": 3.653186911946241e-05, "loss": 0.6607, "step": 2652 }, { "epoch": 0.2065495596321347, "grad_norm": 2.1130604097254864, "learning_rate": 3.6529059219129216e-05, "loss": 0.6323, "step": 2653 }, { "epoch": 0.20662741472434432, "grad_norm": 2.142396832388386, "learning_rate": 3.6526248289113325e-05, "loss": 0.6213, "step": 2654 }, { "epoch": 0.20670526981655393, "grad_norm": 2.386672360739649, "learning_rate": 3.6523436329589846e-05, "loss": 0.6801, "step": 2655 }, { "epoch": 0.20678312490876358, "grad_norm": 2.4109395389454784, "learning_rate": 3.652062334073395e-05, "loss": 0.6553, "step": 2656 }, { "epoch": 0.2068609800009732, "grad_norm": 2.319075171011834, "learning_rate": 3.651780932272088e-05, "loss": 0.7028, "step": 2657 }, { "epoch": 0.2069388350931828, "grad_norm": 2.3960364271348973, "learning_rate": 3.6514994275725916e-05, "loss": 0.6271, "step": 2658 }, { "epoch": 0.20701669018539243, "grad_norm": 2.1727932595070314, "learning_rate": 3.651217819992445e-05, "loss": 0.5849, "step": 2659 }, { "epoch": 0.20709454527760207, "grad_norm": 2.230945178567531, "learning_rate": 3.6509361095491885e-05, "loss": 0.641, "step": 2660 }, { "epoch": 0.2071724003698117, "grad_norm": 2.236092838901786, "learning_rate": 3.650654296260373e-05, "loss": 0.6249, "step": 2661 }, { "epoch": 0.2072502554620213, "grad_norm": 2.121819527627656, "learning_rate": 3.6503723801435534e-05, "loss": 0.5882, "step": 2662 }, { "epoch": 0.20732811055423095, "grad_norm": 2.2812512578780453, "learning_rate": 3.6500903612162925e-05, "loss": 0.661, "step": 2663 }, { "epoch": 0.20740596564644057, "grad_norm": 2.2508395342432057, "learning_rate": 3.6498082394961576e-05, "loss": 0.6435, "step": 2664 }, { "epoch": 0.20748382073865018, "grad_norm": 2.2563891210438, "learning_rate": 3.6495260150007245e-05, "loss": 0.6393, "step": 2665 }, { "epoch": 0.2075616758308598, "grad_norm": 2.1286474576884844, "learning_rate": 3.649243687747574e-05, "loss": 0.6235, "step": 2666 }, { "epoch": 0.20763953092306944, "grad_norm": 2.098017910376364, "learning_rate": 3.648961257754294e-05, "loss": 0.6189, "step": 2667 }, { "epoch": 0.20771738601527906, "grad_norm": 2.1090305674241177, "learning_rate": 3.6486787250384785e-05, "loss": 0.6394, "step": 2668 }, { "epoch": 0.20779524110748868, "grad_norm": 2.232425701485665, "learning_rate": 3.648396089617728e-05, "loss": 0.6428, "step": 2669 }, { "epoch": 0.20787309619969832, "grad_norm": 2.20256446103717, "learning_rate": 3.6481133515096486e-05, "loss": 0.6366, "step": 2670 }, { "epoch": 0.20795095129190794, "grad_norm": 2.1574484438809316, "learning_rate": 3.647830510731855e-05, "loss": 0.6463, "step": 2671 }, { "epoch": 0.20802880638411755, "grad_norm": 2.2416750351478076, "learning_rate": 3.6475475673019656e-05, "loss": 0.6405, "step": 2672 }, { "epoch": 0.2081066614763272, "grad_norm": 2.2218729347337813, "learning_rate": 3.647264521237607e-05, "loss": 0.6602, "step": 2673 }, { "epoch": 0.20818451656853681, "grad_norm": 2.214707789999453, "learning_rate": 3.6469813725564116e-05, "loss": 0.6359, "step": 2674 }, { "epoch": 0.20826237166074643, "grad_norm": 2.235189395400937, "learning_rate": 3.6466981212760185e-05, "loss": 0.6401, "step": 2675 }, { "epoch": 0.20834022675295605, "grad_norm": 2.2548587241345905, "learning_rate": 3.646414767414072e-05, "loss": 0.6478, "step": 2676 }, { "epoch": 0.2084180818451657, "grad_norm": 2.154170803510431, "learning_rate": 3.646131310988224e-05, "loss": 0.6544, "step": 2677 }, { "epoch": 0.2084959369373753, "grad_norm": 2.222706855444404, "learning_rate": 3.645847752016134e-05, "loss": 0.6376, "step": 2678 }, { "epoch": 0.20857379202958493, "grad_norm": 2.3841691395389977, "learning_rate": 3.645564090515465e-05, "loss": 0.6314, "step": 2679 }, { "epoch": 0.20865164712179457, "grad_norm": 2.44702836681307, "learning_rate": 3.645280326503887e-05, "loss": 0.6739, "step": 2680 }, { "epoch": 0.2087295022140042, "grad_norm": 2.5456878923698594, "learning_rate": 3.644996459999079e-05, "loss": 0.7019, "step": 2681 }, { "epoch": 0.2088073573062138, "grad_norm": 2.284497413632225, "learning_rate": 3.644712491018724e-05, "loss": 0.6497, "step": 2682 }, { "epoch": 0.20888521239842345, "grad_norm": 2.1454942684496205, "learning_rate": 3.6444284195805105e-05, "loss": 0.5667, "step": 2683 }, { "epoch": 0.20896306749063306, "grad_norm": 2.1625969674268872, "learning_rate": 3.644144245702137e-05, "loss": 0.6596, "step": 2684 }, { "epoch": 0.20904092258284268, "grad_norm": 2.5247638779280765, "learning_rate": 3.643859969401305e-05, "loss": 0.7375, "step": 2685 }, { "epoch": 0.2091187776750523, "grad_norm": 2.1134944000997584, "learning_rate": 3.643575590695723e-05, "loss": 0.6035, "step": 2686 }, { "epoch": 0.20919663276726194, "grad_norm": 2.21439505014663, "learning_rate": 3.643291109603108e-05, "loss": 0.6571, "step": 2687 }, { "epoch": 0.20927448785947156, "grad_norm": 2.035881935040208, "learning_rate": 3.64300652614118e-05, "loss": 0.5476, "step": 2688 }, { "epoch": 0.20935234295168118, "grad_norm": 2.350731504899691, "learning_rate": 3.64272184032767e-05, "loss": 0.6369, "step": 2689 }, { "epoch": 0.20943019804389082, "grad_norm": 2.2262865691259313, "learning_rate": 3.6424370521803095e-05, "loss": 0.6346, "step": 2690 }, { "epoch": 0.20950805313610044, "grad_norm": 2.1957045852888615, "learning_rate": 3.6421521617168406e-05, "loss": 0.6311, "step": 2691 }, { "epoch": 0.20958590822831005, "grad_norm": 2.15806064571295, "learning_rate": 3.641867168955011e-05, "loss": 0.6011, "step": 2692 }, { "epoch": 0.20966376332051967, "grad_norm": 2.3953014553102134, "learning_rate": 3.6415820739125753e-05, "loss": 0.6781, "step": 2693 }, { "epoch": 0.2097416184127293, "grad_norm": 2.0990407543868552, "learning_rate": 3.641296876607292e-05, "loss": 0.6117, "step": 2694 }, { "epoch": 0.20981947350493893, "grad_norm": 2.173134523757305, "learning_rate": 3.641011577056927e-05, "loss": 0.5739, "step": 2695 }, { "epoch": 0.20989732859714855, "grad_norm": 2.1473163865673026, "learning_rate": 3.6407261752792556e-05, "loss": 0.6192, "step": 2696 }, { "epoch": 0.2099751836893582, "grad_norm": 2.2752173290727677, "learning_rate": 3.640440671292054e-05, "loss": 0.6448, "step": 2697 }, { "epoch": 0.2100530387815678, "grad_norm": 2.187089150194623, "learning_rate": 3.640155065113111e-05, "loss": 0.631, "step": 2698 }, { "epoch": 0.21013089387377742, "grad_norm": 2.3731925932863964, "learning_rate": 3.639869356760216e-05, "loss": 0.6843, "step": 2699 }, { "epoch": 0.21020874896598707, "grad_norm": 2.208453916029509, "learning_rate": 3.639583546251168e-05, "loss": 0.6514, "step": 2700 }, { "epoch": 0.21020874896598707, "eval_loss": 0.08020205795764923, "eval_runtime": 162.364, "eval_samples_per_second": 17.738, "eval_steps_per_second": 0.634, "step": 2700 }, { "epoch": 0.21028660405819669, "grad_norm": 2.255515240066994, "learning_rate": 3.6392976336037726e-05, "loss": 0.6617, "step": 2701 }, { "epoch": 0.2103644591504063, "grad_norm": 2.29173576441237, "learning_rate": 3.6390116188358396e-05, "loss": 0.657, "step": 2702 }, { "epoch": 0.21044231424261592, "grad_norm": 2.359622191302602, "learning_rate": 3.638725501965186e-05, "loss": 0.7108, "step": 2703 }, { "epoch": 0.21052016933482556, "grad_norm": 2.1403029786305647, "learning_rate": 3.6384392830096376e-05, "loss": 0.6168, "step": 2704 }, { "epoch": 0.21059802442703518, "grad_norm": 2.24309885272348, "learning_rate": 3.638152961987022e-05, "loss": 0.5869, "step": 2705 }, { "epoch": 0.2106758795192448, "grad_norm": 2.268233885155282, "learning_rate": 3.637866538915177e-05, "loss": 0.6535, "step": 2706 }, { "epoch": 0.21075373461145444, "grad_norm": 2.181107270002268, "learning_rate": 3.637580013811946e-05, "loss": 0.6292, "step": 2707 }, { "epoch": 0.21083158970366406, "grad_norm": 2.277495210123071, "learning_rate": 3.637293386695177e-05, "loss": 0.6304, "step": 2708 }, { "epoch": 0.21090944479587367, "grad_norm": 2.0706353592040383, "learning_rate": 3.637006657582727e-05, "loss": 0.5893, "step": 2709 }, { "epoch": 0.21098729988808332, "grad_norm": 2.348739727482185, "learning_rate": 3.636719826492456e-05, "loss": 0.7165, "step": 2710 }, { "epoch": 0.21106515498029293, "grad_norm": 2.2418029842774057, "learning_rate": 3.636432893442233e-05, "loss": 0.6632, "step": 2711 }, { "epoch": 0.21114301007250255, "grad_norm": 2.224021619570616, "learning_rate": 3.636145858449932e-05, "loss": 0.6802, "step": 2712 }, { "epoch": 0.21122086516471217, "grad_norm": 2.0894408678931495, "learning_rate": 3.6358587215334355e-05, "loss": 0.6128, "step": 2713 }, { "epoch": 0.2112987202569218, "grad_norm": 2.059325619858888, "learning_rate": 3.6355714827106294e-05, "loss": 0.6111, "step": 2714 }, { "epoch": 0.21137657534913143, "grad_norm": 2.1673500365143084, "learning_rate": 3.6352841419994085e-05, "loss": 0.6454, "step": 2715 }, { "epoch": 0.21145443044134105, "grad_norm": 2.199335321558253, "learning_rate": 3.6349966994176715e-05, "loss": 0.5547, "step": 2716 }, { "epoch": 0.2115322855335507, "grad_norm": 2.1360054581714225, "learning_rate": 3.6347091549833255e-05, "loss": 0.6114, "step": 2717 }, { "epoch": 0.2116101406257603, "grad_norm": 2.1603820085819496, "learning_rate": 3.634421508714283e-05, "loss": 0.6239, "step": 2718 }, { "epoch": 0.21168799571796992, "grad_norm": 2.1646211035728746, "learning_rate": 3.6341337606284633e-05, "loss": 0.6268, "step": 2719 }, { "epoch": 0.21176585081017954, "grad_norm": 2.4185942509886784, "learning_rate": 3.63384591074379e-05, "loss": 0.6537, "step": 2720 }, { "epoch": 0.21184370590238918, "grad_norm": 2.238173384001653, "learning_rate": 3.6335579590781975e-05, "loss": 0.6879, "step": 2721 }, { "epoch": 0.2119215609945988, "grad_norm": 2.19621064373978, "learning_rate": 3.633269905649622e-05, "loss": 0.6229, "step": 2722 }, { "epoch": 0.21199941608680842, "grad_norm": 2.1850872263021475, "learning_rate": 3.6329817504760085e-05, "loss": 0.6702, "step": 2723 }, { "epoch": 0.21207727117901806, "grad_norm": 2.4479842399551655, "learning_rate": 3.632693493575308e-05, "loss": 0.6561, "step": 2724 }, { "epoch": 0.21215512627122768, "grad_norm": 2.1175396182290136, "learning_rate": 3.6324051349654766e-05, "loss": 0.5846, "step": 2725 }, { "epoch": 0.2122329813634373, "grad_norm": 2.1796091749129447, "learning_rate": 3.632116674664479e-05, "loss": 0.6078, "step": 2726 }, { "epoch": 0.21231083645564694, "grad_norm": 2.0109286460763762, "learning_rate": 3.631828112690284e-05, "loss": 0.5711, "step": 2727 }, { "epoch": 0.21238869154785656, "grad_norm": 1.996534446921915, "learning_rate": 3.631539449060868e-05, "loss": 0.5694, "step": 2728 }, { "epoch": 0.21246654664006617, "grad_norm": 2.362955131784937, "learning_rate": 3.631250683794212e-05, "loss": 0.6435, "step": 2729 }, { "epoch": 0.2125444017322758, "grad_norm": 2.2183925272266096, "learning_rate": 3.6309618169083075e-05, "loss": 0.586, "step": 2730 }, { "epoch": 0.21262225682448543, "grad_norm": 2.1706071668703104, "learning_rate": 3.630672848421147e-05, "loss": 0.5891, "step": 2731 }, { "epoch": 0.21270011191669505, "grad_norm": 2.220143533507963, "learning_rate": 3.630383778350734e-05, "loss": 0.6286, "step": 2732 }, { "epoch": 0.21277796700890467, "grad_norm": 2.6145962768694795, "learning_rate": 3.6300946067150735e-05, "loss": 0.725, "step": 2733 }, { "epoch": 0.2128558221011143, "grad_norm": 2.2518747973591817, "learning_rate": 3.629805333532182e-05, "loss": 0.6509, "step": 2734 }, { "epoch": 0.21293367719332393, "grad_norm": 2.157584472214685, "learning_rate": 3.629515958820079e-05, "loss": 0.6499, "step": 2735 }, { "epoch": 0.21301153228553354, "grad_norm": 2.2915743355961853, "learning_rate": 3.6292264825967915e-05, "loss": 0.6406, "step": 2736 }, { "epoch": 0.2130893873777432, "grad_norm": 2.271086046069393, "learning_rate": 3.628936904880352e-05, "loss": 0.5834, "step": 2737 }, { "epoch": 0.2131672424699528, "grad_norm": 2.192589235353972, "learning_rate": 3.628647225688799e-05, "loss": 0.6396, "step": 2738 }, { "epoch": 0.21324509756216242, "grad_norm": 2.4054371794986777, "learning_rate": 3.62835744504018e-05, "loss": 0.6639, "step": 2739 }, { "epoch": 0.21332295265437204, "grad_norm": 2.2651292495906254, "learning_rate": 3.628067562952546e-05, "loss": 0.6258, "step": 2740 }, { "epoch": 0.21340080774658168, "grad_norm": 2.283070084582102, "learning_rate": 3.627777579443956e-05, "loss": 0.6978, "step": 2741 }, { "epoch": 0.2134786628387913, "grad_norm": 2.4051353316793307, "learning_rate": 3.6274874945324734e-05, "loss": 0.6603, "step": 2742 }, { "epoch": 0.21355651793100092, "grad_norm": 2.3507321455980756, "learning_rate": 3.62719730823617e-05, "loss": 0.6668, "step": 2743 }, { "epoch": 0.21363437302321056, "grad_norm": 2.2120555024635755, "learning_rate": 3.6269070205731224e-05, "loss": 0.6114, "step": 2744 }, { "epoch": 0.21371222811542018, "grad_norm": 2.4600947102393964, "learning_rate": 3.6266166315614155e-05, "loss": 0.6652, "step": 2745 }, { "epoch": 0.2137900832076298, "grad_norm": 2.3207507473783764, "learning_rate": 3.6263261412191374e-05, "loss": 0.6908, "step": 2746 }, { "epoch": 0.2138679382998394, "grad_norm": 2.2774616175910016, "learning_rate": 3.6260355495643856e-05, "loss": 0.632, "step": 2747 }, { "epoch": 0.21394579339204906, "grad_norm": 2.18121173205088, "learning_rate": 3.6257448566152615e-05, "loss": 0.5985, "step": 2748 }, { "epoch": 0.21402364848425867, "grad_norm": 2.1668716946708204, "learning_rate": 3.6254540623898746e-05, "loss": 0.6086, "step": 2749 }, { "epoch": 0.2141015035764683, "grad_norm": 2.283254847071064, "learning_rate": 3.6251631669063405e-05, "loss": 0.6248, "step": 2750 }, { "epoch": 0.2141015035764683, "eval_loss": 0.07729082554578781, "eval_runtime": 162.4957, "eval_samples_per_second": 17.724, "eval_steps_per_second": 0.634, "step": 2750 }, { "epoch": 0.21417935866867793, "grad_norm": 2.392840136534934, "learning_rate": 3.624872170182779e-05, "loss": 0.6916, "step": 2751 }, { "epoch": 0.21425721376088755, "grad_norm": 2.3150830461252982, "learning_rate": 3.6245810722373196e-05, "loss": 0.6618, "step": 2752 }, { "epoch": 0.21433506885309717, "grad_norm": 2.2951566071878533, "learning_rate": 3.624289873088096e-05, "loss": 0.6607, "step": 2753 }, { "epoch": 0.2144129239453068, "grad_norm": 2.231274884094942, "learning_rate": 3.623998572753247e-05, "loss": 0.661, "step": 2754 }, { "epoch": 0.21449077903751643, "grad_norm": 2.1577054254409753, "learning_rate": 3.623707171250922e-05, "loss": 0.6307, "step": 2755 }, { "epoch": 0.21456863412972604, "grad_norm": 2.2389089209538175, "learning_rate": 3.623415668599271e-05, "loss": 0.5759, "step": 2756 }, { "epoch": 0.21464648922193566, "grad_norm": 2.0154688806439736, "learning_rate": 3.623124064816454e-05, "loss": 0.5789, "step": 2757 }, { "epoch": 0.2147243443141453, "grad_norm": 2.2044068151441834, "learning_rate": 3.622832359920639e-05, "loss": 0.6435, "step": 2758 }, { "epoch": 0.21480219940635492, "grad_norm": 2.3959723039747756, "learning_rate": 3.622540553929995e-05, "loss": 0.6916, "step": 2759 }, { "epoch": 0.21488005449856454, "grad_norm": 2.273982173579362, "learning_rate": 3.622248646862701e-05, "loss": 0.6253, "step": 2760 }, { "epoch": 0.21495790959077418, "grad_norm": 2.139145001878698, "learning_rate": 3.621956638736942e-05, "loss": 0.6232, "step": 2761 }, { "epoch": 0.2150357646829838, "grad_norm": 2.147887166662121, "learning_rate": 3.621664529570908e-05, "loss": 0.6067, "step": 2762 }, { "epoch": 0.21511361977519342, "grad_norm": 2.1801900521411075, "learning_rate": 3.6213723193827964e-05, "loss": 0.6192, "step": 2763 }, { "epoch": 0.21519147486740306, "grad_norm": 2.231562727728395, "learning_rate": 3.62108000819081e-05, "loss": 0.6185, "step": 2764 }, { "epoch": 0.21526932995961268, "grad_norm": 2.234630798782647, "learning_rate": 3.620787596013159e-05, "loss": 0.6467, "step": 2765 }, { "epoch": 0.2153471850518223, "grad_norm": 2.2388930041515507, "learning_rate": 3.6204950828680596e-05, "loss": 0.6533, "step": 2766 }, { "epoch": 0.2154250401440319, "grad_norm": 2.243723554252883, "learning_rate": 3.620202468773733e-05, "loss": 0.5883, "step": 2767 }, { "epoch": 0.21550289523624155, "grad_norm": 2.0766896857342805, "learning_rate": 3.619909753748408e-05, "loss": 0.5871, "step": 2768 }, { "epoch": 0.21558075032845117, "grad_norm": 2.0912867754125597, "learning_rate": 3.619616937810321e-05, "loss": 0.5731, "step": 2769 }, { "epoch": 0.2156586054206608, "grad_norm": 2.185157341680534, "learning_rate": 3.61932402097771e-05, "loss": 0.5979, "step": 2770 }, { "epoch": 0.21573646051287043, "grad_norm": 2.3568831046935927, "learning_rate": 3.6190310032688254e-05, "loss": 0.6242, "step": 2771 }, { "epoch": 0.21581431560508005, "grad_norm": 2.2258613267850347, "learning_rate": 3.618737884701918e-05, "loss": 0.6034, "step": 2772 }, { "epoch": 0.21589217069728966, "grad_norm": 2.1674465671630596, "learning_rate": 3.6184446652952496e-05, "loss": 0.6767, "step": 2773 }, { "epoch": 0.21597002578949928, "grad_norm": 2.2329181614158506, "learning_rate": 3.618151345067087e-05, "loss": 0.584, "step": 2774 }, { "epoch": 0.21604788088170893, "grad_norm": 2.1647420896889553, "learning_rate": 3.6178579240357e-05, "loss": 0.6154, "step": 2775 }, { "epoch": 0.21612573597391854, "grad_norm": 2.2452237174421956, "learning_rate": 3.61756440221937e-05, "loss": 0.6066, "step": 2776 }, { "epoch": 0.21620359106612816, "grad_norm": 2.0269952963840314, "learning_rate": 3.61727077963638e-05, "loss": 0.6066, "step": 2777 }, { "epoch": 0.2162814461583378, "grad_norm": 2.1579728893824797, "learning_rate": 3.6169770563050236e-05, "loss": 0.5473, "step": 2778 }, { "epoch": 0.21635930125054742, "grad_norm": 2.19013207399773, "learning_rate": 3.616683232243596e-05, "loss": 0.5988, "step": 2779 }, { "epoch": 0.21643715634275704, "grad_norm": 2.320407229213769, "learning_rate": 3.616389307470402e-05, "loss": 0.6504, "step": 2780 }, { "epoch": 0.21651501143496668, "grad_norm": 2.063274774831317, "learning_rate": 3.616095282003753e-05, "loss": 0.5647, "step": 2781 }, { "epoch": 0.2165928665271763, "grad_norm": 2.2058247660225394, "learning_rate": 3.615801155861963e-05, "loss": 0.6295, "step": 2782 }, { "epoch": 0.21667072161938591, "grad_norm": 2.110457190655032, "learning_rate": 3.615506929063357e-05, "loss": 0.5701, "step": 2783 }, { "epoch": 0.21674857671159553, "grad_norm": 2.1907583859830795, "learning_rate": 3.615212601626262e-05, "loss": 0.6155, "step": 2784 }, { "epoch": 0.21682643180380518, "grad_norm": 2.1328062795129266, "learning_rate": 3.6149181735690145e-05, "loss": 0.5873, "step": 2785 }, { "epoch": 0.2169042868960148, "grad_norm": 2.3285542273273068, "learning_rate": 3.6146236449099553e-05, "loss": 0.6402, "step": 2786 }, { "epoch": 0.2169821419882244, "grad_norm": 2.06417793237476, "learning_rate": 3.6143290156674325e-05, "loss": 0.6011, "step": 2787 }, { "epoch": 0.21705999708043405, "grad_norm": 2.128601629431748, "learning_rate": 3.6140342858598e-05, "loss": 0.5789, "step": 2788 }, { "epoch": 0.21713785217264367, "grad_norm": 2.133692988918294, "learning_rate": 3.6137394555054186e-05, "loss": 0.5685, "step": 2789 }, { "epoch": 0.21721570726485329, "grad_norm": 2.3283843712203818, "learning_rate": 3.613444524622654e-05, "loss": 0.61, "step": 2790 }, { "epoch": 0.21729356235706293, "grad_norm": 2.1945817804788854, "learning_rate": 3.6131494932298795e-05, "loss": 0.5919, "step": 2791 }, { "epoch": 0.21737141744927255, "grad_norm": 1.9977223709595182, "learning_rate": 3.6128543613454744e-05, "loss": 0.5405, "step": 2792 }, { "epoch": 0.21744927254148216, "grad_norm": 2.1525421819813277, "learning_rate": 3.612559128987823e-05, "loss": 0.6512, "step": 2793 }, { "epoch": 0.21752712763369178, "grad_norm": 2.0197394289502997, "learning_rate": 3.612263796175318e-05, "loss": 0.5665, "step": 2794 }, { "epoch": 0.21760498272590142, "grad_norm": 2.2937839271364897, "learning_rate": 3.611968362926357e-05, "loss": 0.6327, "step": 2795 }, { "epoch": 0.21768283781811104, "grad_norm": 2.1689315523436186, "learning_rate": 3.6116728292593426e-05, "loss": 0.6265, "step": 2796 }, { "epoch": 0.21776069291032066, "grad_norm": 2.1396112472499316, "learning_rate": 3.611377195192688e-05, "loss": 0.6129, "step": 2797 }, { "epoch": 0.2178385480025303, "grad_norm": 2.029986627245041, "learning_rate": 3.611081460744808e-05, "loss": 0.5554, "step": 2798 }, { "epoch": 0.21791640309473992, "grad_norm": 2.207699327805603, "learning_rate": 3.610785625934125e-05, "loss": 0.5918, "step": 2799 }, { "epoch": 0.21799425818694954, "grad_norm": 2.0025859402227355, "learning_rate": 3.610489690779069e-05, "loss": 0.5491, "step": 2800 }, { "epoch": 0.21799425818694954, "eval_loss": 0.07479032874107361, "eval_runtime": 162.815, "eval_samples_per_second": 17.689, "eval_steps_per_second": 0.633, "step": 2800 }, { "epoch": 0.21807211327915915, "grad_norm": 2.2401931370784665, "learning_rate": 3.6101936552980756e-05, "loss": 0.5726, "step": 2801 }, { "epoch": 0.2181499683713688, "grad_norm": 2.1069652103638594, "learning_rate": 3.609897519509586e-05, "loss": 0.5957, "step": 2802 }, { "epoch": 0.2182278234635784, "grad_norm": 2.306566003863182, "learning_rate": 3.609601283432048e-05, "loss": 0.6234, "step": 2803 }, { "epoch": 0.21830567855578803, "grad_norm": 1.9816874948441867, "learning_rate": 3.609304947083916e-05, "loss": 0.5457, "step": 2804 }, { "epoch": 0.21838353364799767, "grad_norm": 1.9714362458912351, "learning_rate": 3.60900851048365e-05, "loss": 0.5891, "step": 2805 }, { "epoch": 0.2184613887402073, "grad_norm": 2.1018293496717737, "learning_rate": 3.6087119736497175e-05, "loss": 0.5814, "step": 2806 }, { "epoch": 0.2185392438324169, "grad_norm": 2.1792519649465247, "learning_rate": 3.60841533660059e-05, "loss": 0.594, "step": 2807 }, { "epoch": 0.21861709892462655, "grad_norm": 2.1814447436457014, "learning_rate": 3.608118599354747e-05, "loss": 0.5534, "step": 2808 }, { "epoch": 0.21869495401683617, "grad_norm": 2.000452272335793, "learning_rate": 3.6078217619306746e-05, "loss": 0.577, "step": 2809 }, { "epoch": 0.21877280910904578, "grad_norm": 2.248650200446355, "learning_rate": 3.607524824346864e-05, "loss": 0.6083, "step": 2810 }, { "epoch": 0.2188506642012554, "grad_norm": 2.275393173965042, "learning_rate": 3.607227786621812e-05, "loss": 0.5915, "step": 2811 }, { "epoch": 0.21892851929346505, "grad_norm": 2.536769249716465, "learning_rate": 3.6069306487740247e-05, "loss": 0.6946, "step": 2812 }, { "epoch": 0.21900637438567466, "grad_norm": 2.0443539014352217, "learning_rate": 3.60663341082201e-05, "loss": 0.572, "step": 2813 }, { "epoch": 0.21908422947788428, "grad_norm": 2.490584514382769, "learning_rate": 3.6063360727842864e-05, "loss": 0.6603, "step": 2814 }, { "epoch": 0.21916208457009392, "grad_norm": 2.190827311434075, "learning_rate": 3.606038634679376e-05, "loss": 0.6058, "step": 2815 }, { "epoch": 0.21923993966230354, "grad_norm": 2.045987799321565, "learning_rate": 3.605741096525807e-05, "loss": 0.5505, "step": 2816 }, { "epoch": 0.21931779475451316, "grad_norm": 2.251620492850248, "learning_rate": 3.605443458342116e-05, "loss": 0.6144, "step": 2817 }, { "epoch": 0.21939564984672277, "grad_norm": 2.214501571390148, "learning_rate": 3.605145720146844e-05, "loss": 0.5823, "step": 2818 }, { "epoch": 0.21947350493893242, "grad_norm": 2.113448018979011, "learning_rate": 3.604847881958538e-05, "loss": 0.5563, "step": 2819 }, { "epoch": 0.21955136003114203, "grad_norm": 2.372593109356277, "learning_rate": 3.604549943795753e-05, "loss": 0.621, "step": 2820 }, { "epoch": 0.21962921512335165, "grad_norm": 2.19639072554273, "learning_rate": 3.604251905677048e-05, "loss": 0.5973, "step": 2821 }, { "epoch": 0.2197070702155613, "grad_norm": 2.2699079075940394, "learning_rate": 3.6039537676209896e-05, "loss": 0.6167, "step": 2822 }, { "epoch": 0.2197849253077709, "grad_norm": 2.219902961248079, "learning_rate": 3.603655529646151e-05, "loss": 0.5924, "step": 2823 }, { "epoch": 0.21986278039998053, "grad_norm": 2.1366061694547263, "learning_rate": 3.603357191771111e-05, "loss": 0.6072, "step": 2824 }, { "epoch": 0.21994063549219017, "grad_norm": 2.1727810401074614, "learning_rate": 3.603058754014454e-05, "loss": 0.6188, "step": 2825 }, { "epoch": 0.2200184905843998, "grad_norm": 2.212422829178849, "learning_rate": 3.602760216394772e-05, "loss": 0.6395, "step": 2826 }, { "epoch": 0.2200963456766094, "grad_norm": 2.20460567274556, "learning_rate": 3.602461578930662e-05, "loss": 0.6208, "step": 2827 }, { "epoch": 0.22017420076881902, "grad_norm": 2.113787649033345, "learning_rate": 3.6021628416407276e-05, "loss": 0.5871, "step": 2828 }, { "epoch": 0.22025205586102867, "grad_norm": 2.0476720770649193, "learning_rate": 3.60186400454358e-05, "loss": 0.5913, "step": 2829 }, { "epoch": 0.22032991095323828, "grad_norm": 2.081867111796023, "learning_rate": 3.601565067657833e-05, "loss": 0.613, "step": 2830 }, { "epoch": 0.2204077660454479, "grad_norm": 2.263116923433606, "learning_rate": 3.601266031002112e-05, "loss": 0.6035, "step": 2831 }, { "epoch": 0.22048562113765754, "grad_norm": 2.187707805904477, "learning_rate": 3.6009668945950425e-05, "loss": 0.5604, "step": 2832 }, { "epoch": 0.22056347622986716, "grad_norm": 2.1806786890696195, "learning_rate": 3.600667658455261e-05, "loss": 0.5915, "step": 2833 }, { "epoch": 0.22064133132207678, "grad_norm": 1.9646528794691647, "learning_rate": 3.600368322601409e-05, "loss": 0.5604, "step": 2834 }, { "epoch": 0.22071918641428642, "grad_norm": 2.0642997614580545, "learning_rate": 3.6000688870521316e-05, "loss": 0.555, "step": 2835 }, { "epoch": 0.22079704150649604, "grad_norm": 2.1316929084385365, "learning_rate": 3.599769351826084e-05, "loss": 0.5328, "step": 2836 }, { "epoch": 0.22087489659870566, "grad_norm": 2.218805046912425, "learning_rate": 3.5994697169419256e-05, "loss": 0.6015, "step": 2837 }, { "epoch": 0.22095275169091527, "grad_norm": 2.0895088630038092, "learning_rate": 3.599169982418322e-05, "loss": 0.5466, "step": 2838 }, { "epoch": 0.22103060678312492, "grad_norm": 2.161795230678846, "learning_rate": 3.5988701482739444e-05, "loss": 0.6423, "step": 2839 }, { "epoch": 0.22110846187533453, "grad_norm": 2.157411565961273, "learning_rate": 3.598570214527473e-05, "loss": 0.6101, "step": 2840 }, { "epoch": 0.22118631696754415, "grad_norm": 2.123277203659554, "learning_rate": 3.598270181197591e-05, "loss": 0.5899, "step": 2841 }, { "epoch": 0.2212641720597538, "grad_norm": 2.129165907814313, "learning_rate": 3.5979700483029886e-05, "loss": 0.5466, "step": 2842 }, { "epoch": 0.2213420271519634, "grad_norm": 2.0302920646415443, "learning_rate": 3.597669815862364e-05, "loss": 0.5601, "step": 2843 }, { "epoch": 0.22141988224417303, "grad_norm": 2.3019764253890043, "learning_rate": 3.597369483894419e-05, "loss": 0.6074, "step": 2844 }, { "epoch": 0.22149773733638264, "grad_norm": 2.2099223320444716, "learning_rate": 3.597069052417864e-05, "loss": 0.6191, "step": 2845 }, { "epoch": 0.2215755924285923, "grad_norm": 2.1201688814806348, "learning_rate": 3.596768521451414e-05, "loss": 0.6041, "step": 2846 }, { "epoch": 0.2216534475208019, "grad_norm": 2.25370688357783, "learning_rate": 3.5964678910137896e-05, "loss": 0.6382, "step": 2847 }, { "epoch": 0.22173130261301152, "grad_norm": 2.006861566683803, "learning_rate": 3.596167161123721e-05, "loss": 0.5745, "step": 2848 }, { "epoch": 0.22180915770522117, "grad_norm": 2.166483329243175, "learning_rate": 3.59586633179994e-05, "loss": 0.6144, "step": 2849 }, { "epoch": 0.22188701279743078, "grad_norm": 2.359671388561071, "learning_rate": 3.595565403061188e-05, "loss": 0.6655, "step": 2850 }, { "epoch": 0.22188701279743078, "eval_loss": 0.0735616683959961, "eval_runtime": 162.525, "eval_samples_per_second": 17.72, "eval_steps_per_second": 0.634, "step": 2850 }, { "epoch": 0.2219648678896404, "grad_norm": 2.2180242112067927, "learning_rate": 3.595264374926211e-05, "loss": 0.6458, "step": 2851 }, { "epoch": 0.22204272298185004, "grad_norm": 2.2474148651757697, "learning_rate": 3.594963247413763e-05, "loss": 0.6024, "step": 2852 }, { "epoch": 0.22212057807405966, "grad_norm": 2.138740834943698, "learning_rate": 3.5946620205426e-05, "loss": 0.5956, "step": 2853 }, { "epoch": 0.22219843316626928, "grad_norm": 2.044296110127925, "learning_rate": 3.594360694331489e-05, "loss": 0.5693, "step": 2854 }, { "epoch": 0.2222762882584789, "grad_norm": 2.1179182896382804, "learning_rate": 3.594059268799201e-05, "loss": 0.542, "step": 2855 }, { "epoch": 0.22235414335068854, "grad_norm": 2.227056878858992, "learning_rate": 3.593757743964514e-05, "loss": 0.6528, "step": 2856 }, { "epoch": 0.22243199844289815, "grad_norm": 2.2085094525474895, "learning_rate": 3.593456119846211e-05, "loss": 0.5825, "step": 2857 }, { "epoch": 0.22250985353510777, "grad_norm": 2.318797008613619, "learning_rate": 3.593154396463081e-05, "loss": 0.6512, "step": 2858 }, { "epoch": 0.22258770862731742, "grad_norm": 2.1240985831537555, "learning_rate": 3.592852573833921e-05, "loss": 0.584, "step": 2859 }, { "epoch": 0.22266556371952703, "grad_norm": 2.203427070761121, "learning_rate": 3.592550651977533e-05, "loss": 0.6152, "step": 2860 }, { "epoch": 0.22274341881173665, "grad_norm": 2.346084153926035, "learning_rate": 3.592248630912724e-05, "loss": 0.6568, "step": 2861 }, { "epoch": 0.2228212739039463, "grad_norm": 2.270730962529661, "learning_rate": 3.5919465106583096e-05, "loss": 0.6302, "step": 2862 }, { "epoch": 0.2228991289961559, "grad_norm": 2.325787247544664, "learning_rate": 3.5916442912331106e-05, "loss": 0.61, "step": 2863 }, { "epoch": 0.22297698408836553, "grad_norm": 2.26587087751919, "learning_rate": 3.591341972655954e-05, "loss": 0.6272, "step": 2864 }, { "epoch": 0.22305483918057514, "grad_norm": 2.307738196630018, "learning_rate": 3.591039554945672e-05, "loss": 0.5499, "step": 2865 }, { "epoch": 0.2231326942727848, "grad_norm": 2.157426626393692, "learning_rate": 3.590737038121105e-05, "loss": 0.644, "step": 2866 }, { "epoch": 0.2232105493649944, "grad_norm": 2.127900183614037, "learning_rate": 3.5904344222010976e-05, "loss": 0.56, "step": 2867 }, { "epoch": 0.22328840445720402, "grad_norm": 2.278282884047387, "learning_rate": 3.590131707204501e-05, "loss": 0.6319, "step": 2868 }, { "epoch": 0.22336625954941366, "grad_norm": 2.1683348611348636, "learning_rate": 3.589828893150174e-05, "loss": 0.6425, "step": 2869 }, { "epoch": 0.22344411464162328, "grad_norm": 2.412210411376832, "learning_rate": 3.5895259800569785e-05, "loss": 0.6588, "step": 2870 }, { "epoch": 0.2235219697338329, "grad_norm": 2.11887203360408, "learning_rate": 3.5892229679437864e-05, "loss": 0.6018, "step": 2871 }, { "epoch": 0.22359982482604251, "grad_norm": 2.2336825474689705, "learning_rate": 3.588919856829473e-05, "loss": 0.6009, "step": 2872 }, { "epoch": 0.22367767991825216, "grad_norm": 1.9237129971185836, "learning_rate": 3.588616646732922e-05, "loss": 0.5572, "step": 2873 }, { "epoch": 0.22375553501046178, "grad_norm": 2.2836203768171655, "learning_rate": 3.588313337673021e-05, "loss": 0.5961, "step": 2874 }, { "epoch": 0.2238333901026714, "grad_norm": 2.2783040029472303, "learning_rate": 3.5880099296686645e-05, "loss": 0.5497, "step": 2875 }, { "epoch": 0.22391124519488104, "grad_norm": 2.185063402168279, "learning_rate": 3.587706422738754e-05, "loss": 0.5756, "step": 2876 }, { "epoch": 0.22398910028709065, "grad_norm": 2.086434792166956, "learning_rate": 3.5874028169021954e-05, "loss": 0.575, "step": 2877 }, { "epoch": 0.22406695537930027, "grad_norm": 2.3609298363253486, "learning_rate": 3.5870991121779036e-05, "loss": 0.5898, "step": 2878 }, { "epoch": 0.22414481047150991, "grad_norm": 2.2917453439058653, "learning_rate": 3.586795308584797e-05, "loss": 0.5942, "step": 2879 }, { "epoch": 0.22422266556371953, "grad_norm": 2.3181414387908426, "learning_rate": 3.586491406141801e-05, "loss": 0.6335, "step": 2880 }, { "epoch": 0.22430052065592915, "grad_norm": 2.2024751134162295, "learning_rate": 3.586187404867848e-05, "loss": 0.5815, "step": 2881 }, { "epoch": 0.22437837574813876, "grad_norm": 2.0158163708835595, "learning_rate": 3.5858833047818743e-05, "loss": 0.5191, "step": 2882 }, { "epoch": 0.2244562308403484, "grad_norm": 2.244787925542103, "learning_rate": 3.585579105902826e-05, "loss": 0.585, "step": 2883 }, { "epoch": 0.22453408593255803, "grad_norm": 2.1995024102683685, "learning_rate": 3.5852748082496524e-05, "loss": 0.5616, "step": 2884 }, { "epoch": 0.22461194102476764, "grad_norm": 2.0154413693738373, "learning_rate": 3.584970411841309e-05, "loss": 0.5826, "step": 2885 }, { "epoch": 0.2246897961169773, "grad_norm": 2.092535370225304, "learning_rate": 3.58466591669676e-05, "loss": 0.5596, "step": 2886 }, { "epoch": 0.2247676512091869, "grad_norm": 2.0441465054327344, "learning_rate": 3.584361322834972e-05, "loss": 0.5666, "step": 2887 }, { "epoch": 0.22484550630139652, "grad_norm": 2.066047451396308, "learning_rate": 3.584056630274921e-05, "loss": 0.5515, "step": 2888 }, { "epoch": 0.22492336139360616, "grad_norm": 2.086446387053513, "learning_rate": 3.583751839035587e-05, "loss": 0.5194, "step": 2889 }, { "epoch": 0.22500121648581578, "grad_norm": 2.0788336799737794, "learning_rate": 3.583446949135959e-05, "loss": 0.5485, "step": 2890 }, { "epoch": 0.2250790715780254, "grad_norm": 2.132160240391773, "learning_rate": 3.583141960595028e-05, "loss": 0.6036, "step": 2891 }, { "epoch": 0.225156926670235, "grad_norm": 2.2413168655323905, "learning_rate": 3.582836873431795e-05, "loss": 0.5818, "step": 2892 }, { "epoch": 0.22523478176244466, "grad_norm": 2.320868259516175, "learning_rate": 3.582531687665265e-05, "loss": 0.6349, "step": 2893 }, { "epoch": 0.22531263685465427, "grad_norm": 1.946128948072504, "learning_rate": 3.582226403314448e-05, "loss": 0.4924, "step": 2894 }, { "epoch": 0.2253904919468639, "grad_norm": 2.151906532098918, "learning_rate": 3.5819210203983645e-05, "loss": 0.5918, "step": 2895 }, { "epoch": 0.22546834703907354, "grad_norm": 2.1082690912305377, "learning_rate": 3.5816155389360366e-05, "loss": 0.5383, "step": 2896 }, { "epoch": 0.22554620213128315, "grad_norm": 2.198392947952316, "learning_rate": 3.581309958946495e-05, "loss": 0.5968, "step": 2897 }, { "epoch": 0.22562405722349277, "grad_norm": 1.9687190307261626, "learning_rate": 3.5810042804487755e-05, "loss": 0.5128, "step": 2898 }, { "epoch": 0.22570191231570239, "grad_norm": 2.063322735505772, "learning_rate": 3.5806985034619216e-05, "loss": 0.5824, "step": 2899 }, { "epoch": 0.22577976740791203, "grad_norm": 2.230492790500706, "learning_rate": 3.5803926280049796e-05, "loss": 0.6131, "step": 2900 }, { "epoch": 0.22577976740791203, "eval_loss": 0.07080856710672379, "eval_runtime": 162.9977, "eval_samples_per_second": 17.669, "eval_steps_per_second": 0.632, "step": 2900 }, { "epoch": 0.22585762250012165, "grad_norm": 2.1663748052886103, "learning_rate": 3.5800866540970065e-05, "loss": 0.5827, "step": 2901 }, { "epoch": 0.22593547759233126, "grad_norm": 2.122494814260884, "learning_rate": 3.579780581757061e-05, "loss": 0.598, "step": 2902 }, { "epoch": 0.2260133326845409, "grad_norm": 2.1495979512416308, "learning_rate": 3.5794744110042114e-05, "loss": 0.5878, "step": 2903 }, { "epoch": 0.22609118777675052, "grad_norm": 2.09926158050775, "learning_rate": 3.5791681418575296e-05, "loss": 0.583, "step": 2904 }, { "epoch": 0.22616904286896014, "grad_norm": 2.161791921587083, "learning_rate": 3.578861774336096e-05, "loss": 0.5679, "step": 2905 }, { "epoch": 0.22624689796116979, "grad_norm": 2.210193550672058, "learning_rate": 3.578555308458995e-05, "loss": 0.5665, "step": 2906 }, { "epoch": 0.2263247530533794, "grad_norm": 2.05662505276072, "learning_rate": 3.578248744245318e-05, "loss": 0.5427, "step": 2907 }, { "epoch": 0.22640260814558902, "grad_norm": 2.046267992162217, "learning_rate": 3.577942081714163e-05, "loss": 0.5232, "step": 2908 }, { "epoch": 0.22648046323779863, "grad_norm": 2.091399904937276, "learning_rate": 3.577635320884633e-05, "loss": 0.569, "step": 2909 }, { "epoch": 0.22655831833000828, "grad_norm": 2.059987182436305, "learning_rate": 3.577328461775838e-05, "loss": 0.5619, "step": 2910 }, { "epoch": 0.2266361734222179, "grad_norm": 2.11352020849907, "learning_rate": 3.5770215044068935e-05, "loss": 0.5448, "step": 2911 }, { "epoch": 0.2267140285144275, "grad_norm": 1.9929223181358973, "learning_rate": 3.576714448796922e-05, "loss": 0.4915, "step": 2912 }, { "epoch": 0.22679188360663716, "grad_norm": 2.0932325593180487, "learning_rate": 3.576407294965052e-05, "loss": 0.5679, "step": 2913 }, { "epoch": 0.22686973869884677, "grad_norm": 2.1048103698769998, "learning_rate": 3.576100042930418e-05, "loss": 0.5661, "step": 2914 }, { "epoch": 0.2269475937910564, "grad_norm": 2.1406194761527146, "learning_rate": 3.575792692712159e-05, "loss": 0.5796, "step": 2915 }, { "epoch": 0.22702544888326603, "grad_norm": 2.0688002237371403, "learning_rate": 3.575485244329422e-05, "loss": 0.5336, "step": 2916 }, { "epoch": 0.22710330397547565, "grad_norm": 2.123202371693106, "learning_rate": 3.5751776978013595e-05, "loss": 0.5517, "step": 2917 }, { "epoch": 0.22718115906768527, "grad_norm": 2.204432572253171, "learning_rate": 3.574870053147131e-05, "loss": 0.5514, "step": 2918 }, { "epoch": 0.22725901415989488, "grad_norm": 2.23085667721873, "learning_rate": 3.5745623103859e-05, "loss": 0.5495, "step": 2919 }, { "epoch": 0.22733686925210453, "grad_norm": 2.1804526365365016, "learning_rate": 3.574254469536839e-05, "loss": 0.5722, "step": 2920 }, { "epoch": 0.22741472434431415, "grad_norm": 2.1557223711879088, "learning_rate": 3.573946530619123e-05, "loss": 0.5521, "step": 2921 }, { "epoch": 0.22749257943652376, "grad_norm": 1.9336328312258513, "learning_rate": 3.573638493651937e-05, "loss": 0.533, "step": 2922 }, { "epoch": 0.2275704345287334, "grad_norm": 2.1464607585671454, "learning_rate": 3.57333035865447e-05, "loss": 0.5769, "step": 2923 }, { "epoch": 0.22764828962094302, "grad_norm": 2.8756650151005054, "learning_rate": 3.573022125645917e-05, "loss": 0.6101, "step": 2924 }, { "epoch": 0.22772614471315264, "grad_norm": 2.0675758473353465, "learning_rate": 3.572713794645479e-05, "loss": 0.517, "step": 2925 }, { "epoch": 0.22780399980536226, "grad_norm": 2.197382529108388, "learning_rate": 3.572405365672364e-05, "loss": 0.6004, "step": 2926 }, { "epoch": 0.2278818548975719, "grad_norm": 2.0658846998853804, "learning_rate": 3.572096838745786e-05, "loss": 0.576, "step": 2927 }, { "epoch": 0.22795970998978152, "grad_norm": 2.0206089970971925, "learning_rate": 3.571788213884964e-05, "loss": 0.5711, "step": 2928 }, { "epoch": 0.22803756508199113, "grad_norm": 2.1442867842613733, "learning_rate": 3.5714794911091246e-05, "loss": 0.557, "step": 2929 }, { "epoch": 0.22811542017420078, "grad_norm": 2.1171231625665103, "learning_rate": 3.5711706704374996e-05, "loss": 0.5571, "step": 2930 }, { "epoch": 0.2281932752664104, "grad_norm": 2.094682979624095, "learning_rate": 3.570861751889327e-05, "loss": 0.5729, "step": 2931 }, { "epoch": 0.22827113035862, "grad_norm": 2.1358058797544968, "learning_rate": 3.5705527354838506e-05, "loss": 0.558, "step": 2932 }, { "epoch": 0.22834898545082966, "grad_norm": 2.0757884936543314, "learning_rate": 3.570243621240321e-05, "loss": 0.5604, "step": 2933 }, { "epoch": 0.22842684054303927, "grad_norm": 2.078481999894842, "learning_rate": 3.569934409177995e-05, "loss": 0.5505, "step": 2934 }, { "epoch": 0.2285046956352489, "grad_norm": 2.225486896633196, "learning_rate": 3.5696250993161346e-05, "loss": 0.6124, "step": 2935 }, { "epoch": 0.2285825507274585, "grad_norm": 2.0900124089789385, "learning_rate": 3.5693156916740085e-05, "loss": 0.5494, "step": 2936 }, { "epoch": 0.22866040581966815, "grad_norm": 2.095323766014082, "learning_rate": 3.5690061862708904e-05, "loss": 0.5706, "step": 2937 }, { "epoch": 0.22873826091187777, "grad_norm": 2.0789987150564855, "learning_rate": 3.568696583126063e-05, "loss": 0.5741, "step": 2938 }, { "epoch": 0.22881611600408738, "grad_norm": 1.91832775644004, "learning_rate": 3.5683868822588117e-05, "loss": 0.5157, "step": 2939 }, { "epoch": 0.22889397109629703, "grad_norm": 2.0520050020108473, "learning_rate": 3.56807708368843e-05, "loss": 0.5699, "step": 2940 }, { "epoch": 0.22897182618850664, "grad_norm": 1.9543156102005632, "learning_rate": 3.5677671874342156e-05, "loss": 0.5189, "step": 2941 }, { "epoch": 0.22904968128071626, "grad_norm": 2.1417092488181217, "learning_rate": 3.5674571935154745e-05, "loss": 0.5882, "step": 2942 }, { "epoch": 0.2291275363729259, "grad_norm": 2.1567956632001253, "learning_rate": 3.567147101951519e-05, "loss": 0.5442, "step": 2943 }, { "epoch": 0.22920539146513552, "grad_norm": 2.207597924033134, "learning_rate": 3.566836912761665e-05, "loss": 0.5798, "step": 2944 }, { "epoch": 0.22928324655734514, "grad_norm": 2.192890347508232, "learning_rate": 3.566526625965236e-05, "loss": 0.5651, "step": 2945 }, { "epoch": 0.22936110164955475, "grad_norm": 2.0475719712214153, "learning_rate": 3.566216241581561e-05, "loss": 0.5522, "step": 2946 }, { "epoch": 0.2294389567417644, "grad_norm": 2.164085132240565, "learning_rate": 3.5659057596299765e-05, "loss": 0.5092, "step": 2947 }, { "epoch": 0.22951681183397402, "grad_norm": 2.1147516394569754, "learning_rate": 3.5655951801298235e-05, "loss": 0.5707, "step": 2948 }, { "epoch": 0.22959466692618363, "grad_norm": 2.326802323664731, "learning_rate": 3.56528450310045e-05, "loss": 0.5994, "step": 2949 }, { "epoch": 0.22967252201839328, "grad_norm": 2.283057517751857, "learning_rate": 3.564973728561209e-05, "loss": 0.613, "step": 2950 }, { "epoch": 0.22967252201839328, "eval_loss": 0.06976208090782166, "eval_runtime": 162.7636, "eval_samples_per_second": 17.694, "eval_steps_per_second": 0.633, "step": 2950 }, { "epoch": 0.2297503771106029, "grad_norm": 2.1735593206474926, "learning_rate": 3.5646628565314616e-05, "loss": 0.5875, "step": 2951 }, { "epoch": 0.2298282322028125, "grad_norm": 2.1725070119987997, "learning_rate": 3.564351887030572e-05, "loss": 0.5449, "step": 2952 }, { "epoch": 0.22990608729502213, "grad_norm": 2.1625547230548237, "learning_rate": 3.564040820077913e-05, "loss": 0.5723, "step": 2953 }, { "epoch": 0.22998394238723177, "grad_norm": 2.0615958794730562, "learning_rate": 3.563729655692862e-05, "loss": 0.4848, "step": 2954 }, { "epoch": 0.2300617974794414, "grad_norm": 2.1574859066233323, "learning_rate": 3.563418393894805e-05, "loss": 0.534, "step": 2955 }, { "epoch": 0.230139652571651, "grad_norm": 1.9321319955559093, "learning_rate": 3.56310703470313e-05, "loss": 0.4828, "step": 2956 }, { "epoch": 0.23021750766386065, "grad_norm": 2.141849006600627, "learning_rate": 3.562795578137234e-05, "loss": 0.5569, "step": 2957 }, { "epoch": 0.23029536275607027, "grad_norm": 2.1037435794258497, "learning_rate": 3.5624840242165196e-05, "loss": 0.5427, "step": 2958 }, { "epoch": 0.23037321784827988, "grad_norm": 2.183631935392086, "learning_rate": 3.562172372960394e-05, "loss": 0.5455, "step": 2959 }, { "epoch": 0.23045107294048953, "grad_norm": 2.169086750431563, "learning_rate": 3.5618606243882735e-05, "loss": 0.5663, "step": 2960 }, { "epoch": 0.23052892803269914, "grad_norm": 2.202224546471869, "learning_rate": 3.5615487785195764e-05, "loss": 0.5442, "step": 2961 }, { "epoch": 0.23060678312490876, "grad_norm": 2.113765804096163, "learning_rate": 3.561236835373731e-05, "loss": 0.4838, "step": 2962 }, { "epoch": 0.23068463821711838, "grad_norm": 2.207259372854105, "learning_rate": 3.560924794970169e-05, "loss": 0.5458, "step": 2963 }, { "epoch": 0.23076249330932802, "grad_norm": 2.177155492622866, "learning_rate": 3.56061265732833e-05, "loss": 0.5647, "step": 2964 }, { "epoch": 0.23084034840153764, "grad_norm": 2.2467821842562063, "learning_rate": 3.560300422467657e-05, "loss": 0.5566, "step": 2965 }, { "epoch": 0.23091820349374725, "grad_norm": 2.100526310528166, "learning_rate": 3.559988090407602e-05, "loss": 0.5846, "step": 2966 }, { "epoch": 0.2309960585859569, "grad_norm": 2.1844884005547627, "learning_rate": 3.559675661167622e-05, "loss": 0.5316, "step": 2967 }, { "epoch": 0.23107391367816651, "grad_norm": 2.2633023934486967, "learning_rate": 3.5593631347671784e-05, "loss": 0.5672, "step": 2968 }, { "epoch": 0.23115176877037613, "grad_norm": 1.9270189224709424, "learning_rate": 3.5590505112257423e-05, "loss": 0.516, "step": 2969 }, { "epoch": 0.23122962386258578, "grad_norm": 2.240987064765136, "learning_rate": 3.558737790562787e-05, "loss": 0.5863, "step": 2970 }, { "epoch": 0.2313074789547954, "grad_norm": 2.17812433863029, "learning_rate": 3.5584249727977945e-05, "loss": 0.4989, "step": 2971 }, { "epoch": 0.231385334047005, "grad_norm": 2.0845191061027086, "learning_rate": 3.558112057950251e-05, "loss": 0.4994, "step": 2972 }, { "epoch": 0.23146318913921463, "grad_norm": 1.9969778842610781, "learning_rate": 3.55779904603965e-05, "loss": 0.5301, "step": 2973 }, { "epoch": 0.23154104423142427, "grad_norm": 2.165932102033047, "learning_rate": 3.557485937085491e-05, "loss": 0.5436, "step": 2974 }, { "epoch": 0.2316188993236339, "grad_norm": 2.4687740431372864, "learning_rate": 3.5571727311072786e-05, "loss": 0.5786, "step": 2975 }, { "epoch": 0.2316967544158435, "grad_norm": 2.0050841148571017, "learning_rate": 3.556859428124525e-05, "loss": 0.5053, "step": 2976 }, { "epoch": 0.23177460950805315, "grad_norm": 2.0514512062816803, "learning_rate": 3.556546028156746e-05, "loss": 0.549, "step": 2977 }, { "epoch": 0.23185246460026276, "grad_norm": 2.0713304032819013, "learning_rate": 3.5562325312234666e-05, "loss": 0.5074, "step": 2978 }, { "epoch": 0.23193031969247238, "grad_norm": 2.3246607136278215, "learning_rate": 3.555918937344215e-05, "loss": 0.5411, "step": 2979 }, { "epoch": 0.232008174784682, "grad_norm": 2.040662801368384, "learning_rate": 3.555605246538528e-05, "loss": 0.5559, "step": 2980 }, { "epoch": 0.23208602987689164, "grad_norm": 1.999438982530565, "learning_rate": 3.555291458825945e-05, "loss": 0.5202, "step": 2981 }, { "epoch": 0.23216388496910126, "grad_norm": 2.2095097336590754, "learning_rate": 3.554977574226014e-05, "loss": 0.5254, "step": 2982 }, { "epoch": 0.23224174006131088, "grad_norm": 1.9061080859730435, "learning_rate": 3.554663592758291e-05, "loss": 0.5213, "step": 2983 }, { "epoch": 0.23231959515352052, "grad_norm": 2.2052454765349503, "learning_rate": 3.554349514442333e-05, "loss": 0.6125, "step": 2984 }, { "epoch": 0.23239745024573014, "grad_norm": 1.97147716559114, "learning_rate": 3.554035339297707e-05, "loss": 0.5165, "step": 2985 }, { "epoch": 0.23247530533793975, "grad_norm": 1.967096092333465, "learning_rate": 3.553721067343982e-05, "loss": 0.545, "step": 2986 }, { "epoch": 0.2325531604301494, "grad_norm": 1.9922491433380998, "learning_rate": 3.5534066986007394e-05, "loss": 0.5208, "step": 2987 }, { "epoch": 0.232631015522359, "grad_norm": 2.00202987887441, "learning_rate": 3.553092233087561e-05, "loss": 0.5467, "step": 2988 }, { "epoch": 0.23270887061456863, "grad_norm": 1.9450979788033211, "learning_rate": 3.552777670824037e-05, "loss": 0.4796, "step": 2989 }, { "epoch": 0.23278672570677825, "grad_norm": 2.108653289492904, "learning_rate": 3.5524630118297615e-05, "loss": 0.5485, "step": 2990 }, { "epoch": 0.2328645807989879, "grad_norm": 2.1572243996063984, "learning_rate": 3.552148256124338e-05, "loss": 0.5513, "step": 2991 }, { "epoch": 0.2329424358911975, "grad_norm": 2.370697727657727, "learning_rate": 3.551833403727374e-05, "loss": 0.601, "step": 2992 }, { "epoch": 0.23302029098340712, "grad_norm": 2.2098146580897575, "learning_rate": 3.5515184546584836e-05, "loss": 0.6097, "step": 2993 }, { "epoch": 0.23309814607561677, "grad_norm": 2.066577867841437, "learning_rate": 3.551203408937286e-05, "loss": 0.56, "step": 2994 }, { "epoch": 0.23317600116782639, "grad_norm": 2.2070028043564323, "learning_rate": 3.5508882665834066e-05, "loss": 0.5626, "step": 2995 }, { "epoch": 0.233253856260036, "grad_norm": 1.955483651880036, "learning_rate": 3.550573027616479e-05, "loss": 0.5141, "step": 2996 }, { "epoch": 0.23333171135224565, "grad_norm": 1.8949094921812473, "learning_rate": 3.55025769205614e-05, "loss": 0.5194, "step": 2997 }, { "epoch": 0.23340956644445526, "grad_norm": 2.114623833406131, "learning_rate": 3.5499422599220334e-05, "loss": 0.5206, "step": 2998 }, { "epoch": 0.23348742153666488, "grad_norm": 2.199407456816248, "learning_rate": 3.549626731233809e-05, "loss": 0.5616, "step": 2999 }, { "epoch": 0.2335652766288745, "grad_norm": 2.121489538856215, "learning_rate": 3.549311106011124e-05, "loss": 0.5189, "step": 3000 }, { "epoch": 0.2335652766288745, "eval_loss": 0.06754348427057266, "eval_runtime": 162.2705, "eval_samples_per_second": 17.748, "eval_steps_per_second": 0.635, "step": 3000 }, { "epoch": 0.23364313172108414, "grad_norm": 1.9962701200764148, "learning_rate": 3.548995384273639e-05, "loss": 0.5421, "step": 3001 }, { "epoch": 0.23372098681329376, "grad_norm": 2.0834696014908958, "learning_rate": 3.5486795660410224e-05, "loss": 0.4806, "step": 3002 }, { "epoch": 0.23379884190550337, "grad_norm": 2.091354188762615, "learning_rate": 3.548363651332949e-05, "loss": 0.5294, "step": 3003 }, { "epoch": 0.23387669699771302, "grad_norm": 1.991662696756993, "learning_rate": 3.548047640169097e-05, "loss": 0.5054, "step": 3004 }, { "epoch": 0.23395455208992263, "grad_norm": 2.106153232499626, "learning_rate": 3.547731532569154e-05, "loss": 0.5116, "step": 3005 }, { "epoch": 0.23403240718213225, "grad_norm": 2.2335776320228695, "learning_rate": 3.5474153285528116e-05, "loss": 0.5581, "step": 3006 }, { "epoch": 0.23411026227434187, "grad_norm": 2.2412066677105873, "learning_rate": 3.547099028139767e-05, "loss": 0.5931, "step": 3007 }, { "epoch": 0.2341881173665515, "grad_norm": 2.032772002635901, "learning_rate": 3.546782631349726e-05, "loss": 0.4676, "step": 3008 }, { "epoch": 0.23426597245876113, "grad_norm": 1.9880326275636813, "learning_rate": 3.546466138202397e-05, "loss": 0.5477, "step": 3009 }, { "epoch": 0.23434382755097075, "grad_norm": 2.1170447210671033, "learning_rate": 3.5461495487174976e-05, "loss": 0.5668, "step": 3010 }, { "epoch": 0.2344216826431804, "grad_norm": 2.2103656439876267, "learning_rate": 3.545832862914747e-05, "loss": 0.546, "step": 3011 }, { "epoch": 0.23449953773539, "grad_norm": 2.173712239452167, "learning_rate": 3.545516080813877e-05, "loss": 0.5157, "step": 3012 }, { "epoch": 0.23457739282759962, "grad_norm": 1.9387904607976372, "learning_rate": 3.5451992024346183e-05, "loss": 0.4791, "step": 3013 }, { "epoch": 0.23465524791980927, "grad_norm": 2.109196431679814, "learning_rate": 3.544882227796713e-05, "loss": 0.5199, "step": 3014 }, { "epoch": 0.23473310301201888, "grad_norm": 1.9977135768482568, "learning_rate": 3.5445651569199064e-05, "loss": 0.5065, "step": 3015 }, { "epoch": 0.2348109581042285, "grad_norm": 2.292458732228027, "learning_rate": 3.5442479898239505e-05, "loss": 0.5563, "step": 3016 }, { "epoch": 0.23488881319643812, "grad_norm": 2.2660437864186767, "learning_rate": 3.543930726528604e-05, "loss": 0.601, "step": 3017 }, { "epoch": 0.23496666828864776, "grad_norm": 2.25997753877146, "learning_rate": 3.5436133670536294e-05, "loss": 0.5581, "step": 3018 }, { "epoch": 0.23504452338085738, "grad_norm": 2.0448200321099064, "learning_rate": 3.543295911418799e-05, "loss": 0.5348, "step": 3019 }, { "epoch": 0.235122378473067, "grad_norm": 2.4259940994618523, "learning_rate": 3.5429783596438864e-05, "loss": 0.5507, "step": 3020 }, { "epoch": 0.23520023356527664, "grad_norm": 2.0620772138159653, "learning_rate": 3.5426607117486746e-05, "loss": 0.5451, "step": 3021 }, { "epoch": 0.23527808865748626, "grad_norm": 2.057971307921385, "learning_rate": 3.542342967752952e-05, "loss": 0.5161, "step": 3022 }, { "epoch": 0.23535594374969587, "grad_norm": 2.0080590907207285, "learning_rate": 3.542025127676512e-05, "loss": 0.5022, "step": 3023 }, { "epoch": 0.23543379884190552, "grad_norm": 2.208488319468777, "learning_rate": 3.5417071915391546e-05, "loss": 0.5615, "step": 3024 }, { "epoch": 0.23551165393411513, "grad_norm": 2.0465075729623603, "learning_rate": 3.541389159360686e-05, "loss": 0.5331, "step": 3025 }, { "epoch": 0.23558950902632475, "grad_norm": 2.0790232915573434, "learning_rate": 3.541071031160918e-05, "loss": 0.4949, "step": 3026 }, { "epoch": 0.23566736411853437, "grad_norm": 2.104191541921189, "learning_rate": 3.5407528069596685e-05, "loss": 0.5491, "step": 3027 }, { "epoch": 0.235745219210744, "grad_norm": 2.1100118055401658, "learning_rate": 3.5404344867767605e-05, "loss": 0.522, "step": 3028 }, { "epoch": 0.23582307430295363, "grad_norm": 2.1002069234621743, "learning_rate": 3.540116070632026e-05, "loss": 0.5097, "step": 3029 }, { "epoch": 0.23590092939516324, "grad_norm": 2.2695593819101636, "learning_rate": 3.5397975585452985e-05, "loss": 0.5643, "step": 3030 }, { "epoch": 0.2359787844873729, "grad_norm": 2.0975127651352397, "learning_rate": 3.5394789505364216e-05, "loss": 0.5656, "step": 3031 }, { "epoch": 0.2360566395795825, "grad_norm": 2.0741199292734644, "learning_rate": 3.539160246625241e-05, "loss": 0.5585, "step": 3032 }, { "epoch": 0.23613449467179212, "grad_norm": 2.07030478760863, "learning_rate": 3.538841446831613e-05, "loss": 0.521, "step": 3033 }, { "epoch": 0.23621234976400174, "grad_norm": 2.0037082296544835, "learning_rate": 3.538522551175395e-05, "loss": 0.5319, "step": 3034 }, { "epoch": 0.23629020485621138, "grad_norm": 2.0598696875021485, "learning_rate": 3.5382035596764544e-05, "loss": 0.5386, "step": 3035 }, { "epoch": 0.236368059948421, "grad_norm": 2.097862344048903, "learning_rate": 3.5378844723546624e-05, "loss": 0.5585, "step": 3036 }, { "epoch": 0.23644591504063062, "grad_norm": 2.2559932064776866, "learning_rate": 3.537565289229896e-05, "loss": 0.5851, "step": 3037 }, { "epoch": 0.23652377013284026, "grad_norm": 2.053552286420964, "learning_rate": 3.5372460103220395e-05, "loss": 0.5001, "step": 3038 }, { "epoch": 0.23660162522504988, "grad_norm": 2.056205912491825, "learning_rate": 3.536926635650982e-05, "loss": 0.5032, "step": 3039 }, { "epoch": 0.2366794803172595, "grad_norm": 1.9434436944731353, "learning_rate": 3.5366071652366196e-05, "loss": 0.4443, "step": 3040 }, { "epoch": 0.23675733540946914, "grad_norm": 2.19198867554439, "learning_rate": 3.536287599098854e-05, "loss": 0.5303, "step": 3041 }, { "epoch": 0.23683519050167876, "grad_norm": 2.301065254634548, "learning_rate": 3.5359679372575914e-05, "loss": 0.5339, "step": 3042 }, { "epoch": 0.23691304559388837, "grad_norm": 2.3289478601843485, "learning_rate": 3.535648179732746e-05, "loss": 0.516, "step": 3043 }, { "epoch": 0.236990900686098, "grad_norm": 2.137056795000239, "learning_rate": 3.535328326544237e-05, "loss": 0.5079, "step": 3044 }, { "epoch": 0.23706875577830763, "grad_norm": 2.246904461531339, "learning_rate": 3.5350083777119904e-05, "loss": 0.5671, "step": 3045 }, { "epoch": 0.23714661087051725, "grad_norm": 2.145565014134147, "learning_rate": 3.534688333255937e-05, "loss": 0.5285, "step": 3046 }, { "epoch": 0.23722446596272687, "grad_norm": 2.029488899271388, "learning_rate": 3.5343681931960136e-05, "loss": 0.5509, "step": 3047 }, { "epoch": 0.2373023210549365, "grad_norm": 2.0166274622749722, "learning_rate": 3.534047957552165e-05, "loss": 0.4952, "step": 3048 }, { "epoch": 0.23738017614714613, "grad_norm": 2.0596746784158655, "learning_rate": 3.533727626344338e-05, "loss": 0.5264, "step": 3049 }, { "epoch": 0.23745803123935574, "grad_norm": 2.030471293531828, "learning_rate": 3.5334071995924905e-05, "loss": 0.5445, "step": 3050 }, { "epoch": 0.23745803123935574, "eval_loss": 0.06559255719184875, "eval_runtime": 162.4651, "eval_samples_per_second": 17.727, "eval_steps_per_second": 0.634, "step": 3050 }, { "epoch": 0.23753588633156536, "grad_norm": 2.16964807642937, "learning_rate": 3.533086677316581e-05, "loss": 0.5881, "step": 3051 }, { "epoch": 0.237613741423775, "grad_norm": 2.1297548390959706, "learning_rate": 3.5327660595365785e-05, "loss": 0.5349, "step": 3052 }, { "epoch": 0.23769159651598462, "grad_norm": 1.992466155571532, "learning_rate": 3.532445346272454e-05, "loss": 0.4828, "step": 3053 }, { "epoch": 0.23776945160819424, "grad_norm": 2.0361714391138865, "learning_rate": 3.5321245375441885e-05, "loss": 0.5515, "step": 3054 }, { "epoch": 0.23784730670040388, "grad_norm": 2.074602002934138, "learning_rate": 3.5318036333717654e-05, "loss": 0.5262, "step": 3055 }, { "epoch": 0.2379251617926135, "grad_norm": 2.131990877775608, "learning_rate": 3.531482633775176e-05, "loss": 0.5352, "step": 3056 }, { "epoch": 0.23800301688482312, "grad_norm": 2.1043493330770637, "learning_rate": 3.5311615387744175e-05, "loss": 0.5072, "step": 3057 }, { "epoch": 0.23808087197703276, "grad_norm": 1.944930785995719, "learning_rate": 3.5308403483894924e-05, "loss": 0.4799, "step": 3058 }, { "epoch": 0.23815872706924238, "grad_norm": 1.9648723414413323, "learning_rate": 3.530519062640409e-05, "loss": 0.4854, "step": 3059 }, { "epoch": 0.238236582161452, "grad_norm": 2.028337622310214, "learning_rate": 3.530197681547182e-05, "loss": 0.4902, "step": 3060 }, { "epoch": 0.2383144372536616, "grad_norm": 1.9119982778001692, "learning_rate": 3.529876205129831e-05, "loss": 0.475, "step": 3061 }, { "epoch": 0.23839229234587125, "grad_norm": 1.9506393606134347, "learning_rate": 3.529554633408385e-05, "loss": 0.506, "step": 3062 }, { "epoch": 0.23847014743808087, "grad_norm": 1.9982477659026006, "learning_rate": 3.529232966402874e-05, "loss": 0.4985, "step": 3063 }, { "epoch": 0.2385480025302905, "grad_norm": 2.2052842560313644, "learning_rate": 3.528911204133338e-05, "loss": 0.5796, "step": 3064 }, { "epoch": 0.23862585762250013, "grad_norm": 2.0162404708744224, "learning_rate": 3.52858934661982e-05, "loss": 0.4838, "step": 3065 }, { "epoch": 0.23870371271470975, "grad_norm": 1.887352176532689, "learning_rate": 3.5282673938823705e-05, "loss": 0.4682, "step": 3066 }, { "epoch": 0.23878156780691936, "grad_norm": 1.977329839298496, "learning_rate": 3.527945345941046e-05, "loss": 0.4691, "step": 3067 }, { "epoch": 0.238859422899129, "grad_norm": 1.9185531672227938, "learning_rate": 3.527623202815908e-05, "loss": 0.5176, "step": 3068 }, { "epoch": 0.23893727799133863, "grad_norm": 2.0979524442570807, "learning_rate": 3.5273009645270266e-05, "loss": 0.5238, "step": 3069 }, { "epoch": 0.23901513308354824, "grad_norm": 2.066751069942327, "learning_rate": 3.5269786310944725e-05, "loss": 0.5365, "step": 3070 }, { "epoch": 0.23909298817575786, "grad_norm": 1.9050897776888958, "learning_rate": 3.5266562025383275e-05, "loss": 0.4724, "step": 3071 }, { "epoch": 0.2391708432679675, "grad_norm": 2.063728123264731, "learning_rate": 3.526333678878677e-05, "loss": 0.5248, "step": 3072 }, { "epoch": 0.23924869836017712, "grad_norm": 2.2508376259623466, "learning_rate": 3.526011060135613e-05, "loss": 0.5451, "step": 3073 }, { "epoch": 0.23932655345238674, "grad_norm": 2.0828871102499447, "learning_rate": 3.5256883463292326e-05, "loss": 0.5288, "step": 3074 }, { "epoch": 0.23940440854459638, "grad_norm": 1.8669497386267193, "learning_rate": 3.525365537479639e-05, "loss": 0.5317, "step": 3075 }, { "epoch": 0.239482263636806, "grad_norm": 2.2105868068084216, "learning_rate": 3.525042633606943e-05, "loss": 0.5759, "step": 3076 }, { "epoch": 0.23956011872901561, "grad_norm": 1.999154164074613, "learning_rate": 3.524719634731259e-05, "loss": 0.5183, "step": 3077 }, { "epoch": 0.23963797382122523, "grad_norm": 2.284462818335656, "learning_rate": 3.524396540872709e-05, "loss": 0.587, "step": 3078 }, { "epoch": 0.23971582891343488, "grad_norm": 2.016716656938628, "learning_rate": 3.5240733520514194e-05, "loss": 0.5497, "step": 3079 }, { "epoch": 0.2397936840056445, "grad_norm": 1.9622584648493229, "learning_rate": 3.523750068287524e-05, "loss": 0.4902, "step": 3080 }, { "epoch": 0.2398715390978541, "grad_norm": 2.1239434820065477, "learning_rate": 3.5234266896011614e-05, "loss": 0.5383, "step": 3081 }, { "epoch": 0.23994939419006375, "grad_norm": 2.063287533264218, "learning_rate": 3.5231032160124765e-05, "loss": 0.5058, "step": 3082 }, { "epoch": 0.24002724928227337, "grad_norm": 1.8526030906078923, "learning_rate": 3.5227796475416206e-05, "loss": 0.4952, "step": 3083 }, { "epoch": 0.24010510437448299, "grad_norm": 2.1761976404647623, "learning_rate": 3.5224559842087504e-05, "loss": 0.5167, "step": 3084 }, { "epoch": 0.24018295946669263, "grad_norm": 2.103680846516352, "learning_rate": 3.5221322260340294e-05, "loss": 0.5374, "step": 3085 }, { "epoch": 0.24026081455890225, "grad_norm": 1.996078115490674, "learning_rate": 3.5218083730376244e-05, "loss": 0.5111, "step": 3086 }, { "epoch": 0.24033866965111186, "grad_norm": 2.003594976791865, "learning_rate": 3.521484425239711e-05, "loss": 0.4961, "step": 3087 }, { "epoch": 0.24041652474332148, "grad_norm": 1.9964484157830382, "learning_rate": 3.5211603826604696e-05, "loss": 0.5091, "step": 3088 }, { "epoch": 0.24049437983553112, "grad_norm": 1.9336934164959667, "learning_rate": 3.520836245320086e-05, "loss": 0.5065, "step": 3089 }, { "epoch": 0.24057223492774074, "grad_norm": 1.9933617376836572, "learning_rate": 3.520512013238754e-05, "loss": 0.5197, "step": 3090 }, { "epoch": 0.24065009001995036, "grad_norm": 2.0917158179870095, "learning_rate": 3.52018768643667e-05, "loss": 0.4938, "step": 3091 }, { "epoch": 0.24072794511216, "grad_norm": 2.247863842759161, "learning_rate": 3.519863264934038e-05, "loss": 0.5606, "step": 3092 }, { "epoch": 0.24080580020436962, "grad_norm": 2.1495320754818286, "learning_rate": 3.51953874875107e-05, "loss": 0.5107, "step": 3093 }, { "epoch": 0.24088365529657924, "grad_norm": 2.056777644005089, "learning_rate": 3.5192141379079794e-05, "loss": 0.5189, "step": 3094 }, { "epoch": 0.24096151038878888, "grad_norm": 2.0533507806624876, "learning_rate": 3.518889432424989e-05, "loss": 0.5275, "step": 3095 }, { "epoch": 0.2410393654809985, "grad_norm": 2.0505602787004684, "learning_rate": 3.518564632322327e-05, "loss": 0.5473, "step": 3096 }, { "epoch": 0.2411172205732081, "grad_norm": 1.9919370473414302, "learning_rate": 3.518239737620225e-05, "loss": 0.4878, "step": 3097 }, { "epoch": 0.24119507566541773, "grad_norm": 2.0695756760946216, "learning_rate": 3.517914748338925e-05, "loss": 0.5201, "step": 3098 }, { "epoch": 0.24127293075762737, "grad_norm": 2.140832630446555, "learning_rate": 3.517589664498671e-05, "loss": 0.5421, "step": 3099 }, { "epoch": 0.241350785849837, "grad_norm": 2.1460997257701875, "learning_rate": 3.517264486119714e-05, "loss": 0.5113, "step": 3100 }, { "epoch": 0.241350785849837, "eval_loss": 0.0641859620809555, "eval_runtime": 162.2274, "eval_samples_per_second": 17.753, "eval_steps_per_second": 0.635, "step": 3100 }, { "epoch": 0.2414286409420466, "grad_norm": 2.1723364006139425, "learning_rate": 3.5169392132223104e-05, "loss": 0.5791, "step": 3101 }, { "epoch": 0.24150649603425625, "grad_norm": 1.9375429752062936, "learning_rate": 3.5166138458267246e-05, "loss": 0.4713, "step": 3102 }, { "epoch": 0.24158435112646587, "grad_norm": 2.1348989196767736, "learning_rate": 3.516288383953225e-05, "loss": 0.5001, "step": 3103 }, { "epoch": 0.24166220621867548, "grad_norm": 2.0058907946949347, "learning_rate": 3.5159628276220864e-05, "loss": 0.5348, "step": 3104 }, { "epoch": 0.2417400613108851, "grad_norm": 2.155593720223828, "learning_rate": 3.515637176853589e-05, "loss": 0.5159, "step": 3105 }, { "epoch": 0.24181791640309475, "grad_norm": 2.1589956907022825, "learning_rate": 3.515311431668019e-05, "loss": 0.5142, "step": 3106 }, { "epoch": 0.24189577149530436, "grad_norm": 2.17064662672794, "learning_rate": 3.514985592085671e-05, "loss": 0.4993, "step": 3107 }, { "epoch": 0.24197362658751398, "grad_norm": 2.1340633425523112, "learning_rate": 3.514659658126841e-05, "loss": 0.5425, "step": 3108 }, { "epoch": 0.24205148167972362, "grad_norm": 2.109560719390489, "learning_rate": 3.514333629811833e-05, "loss": 0.4964, "step": 3109 }, { "epoch": 0.24212933677193324, "grad_norm": 2.182587110747358, "learning_rate": 3.514007507160959e-05, "loss": 0.5475, "step": 3110 }, { "epoch": 0.24220719186414286, "grad_norm": 2.0449065192885247, "learning_rate": 3.513681290194533e-05, "loss": 0.5066, "step": 3111 }, { "epoch": 0.2422850469563525, "grad_norm": 1.947773095661681, "learning_rate": 3.513354978932878e-05, "loss": 0.477, "step": 3112 }, { "epoch": 0.24236290204856212, "grad_norm": 2.2814454125067605, "learning_rate": 3.5130285733963214e-05, "loss": 0.5501, "step": 3113 }, { "epoch": 0.24244075714077173, "grad_norm": 2.123388721083193, "learning_rate": 3.512702073605196e-05, "loss": 0.5011, "step": 3114 }, { "epoch": 0.24251861223298135, "grad_norm": 2.1787918570112317, "learning_rate": 3.5123754795798416e-05, "loss": 0.5554, "step": 3115 }, { "epoch": 0.242596467325191, "grad_norm": 2.1208487344701124, "learning_rate": 3.5120487913406044e-05, "loss": 0.5321, "step": 3116 }, { "epoch": 0.2426743224174006, "grad_norm": 1.9685240724852995, "learning_rate": 3.511722008907834e-05, "loss": 0.4765, "step": 3117 }, { "epoch": 0.24275217750961023, "grad_norm": 1.9831598289646857, "learning_rate": 3.511395132301889e-05, "loss": 0.4686, "step": 3118 }, { "epoch": 0.24283003260181987, "grad_norm": 2.0262812313942096, "learning_rate": 3.511068161543131e-05, "loss": 0.5174, "step": 3119 }, { "epoch": 0.2429078876940295, "grad_norm": 2.1414565892643536, "learning_rate": 3.510741096651929e-05, "loss": 0.5191, "step": 3120 }, { "epoch": 0.2429857427862391, "grad_norm": 2.0575204698991416, "learning_rate": 3.510413937648658e-05, "loss": 0.4952, "step": 3121 }, { "epoch": 0.24306359787844875, "grad_norm": 2.1449393260034055, "learning_rate": 3.510086684553698e-05, "loss": 0.5451, "step": 3122 }, { "epoch": 0.24314145297065837, "grad_norm": 2.177795623715772, "learning_rate": 3.5097593373874354e-05, "loss": 0.5397, "step": 3123 }, { "epoch": 0.24321930806286798, "grad_norm": 2.0525012572561616, "learning_rate": 3.509431896170263e-05, "loss": 0.4879, "step": 3124 }, { "epoch": 0.2432971631550776, "grad_norm": 1.9702471822813854, "learning_rate": 3.509104360922578e-05, "loss": 0.5138, "step": 3125 }, { "epoch": 0.24337501824728724, "grad_norm": 1.9891909027361876, "learning_rate": 3.5087767316647854e-05, "loss": 0.5551, "step": 3126 }, { "epoch": 0.24345287333949686, "grad_norm": 2.0491993814591707, "learning_rate": 3.508449008417294e-05, "loss": 0.4777, "step": 3127 }, { "epoch": 0.24353072843170648, "grad_norm": 2.0497932185003207, "learning_rate": 3.5081211912005195e-05, "loss": 0.4911, "step": 3128 }, { "epoch": 0.24360858352391612, "grad_norm": 2.0167302176754354, "learning_rate": 3.507793280034884e-05, "loss": 0.5401, "step": 3129 }, { "epoch": 0.24368643861612574, "grad_norm": 1.9145471698263359, "learning_rate": 3.5074652749408145e-05, "loss": 0.5514, "step": 3130 }, { "epoch": 0.24376429370833536, "grad_norm": 2.041189075978616, "learning_rate": 3.507137175938743e-05, "loss": 0.4869, "step": 3131 }, { "epoch": 0.24384214880054497, "grad_norm": 2.0449749024838315, "learning_rate": 3.5068089830491106e-05, "loss": 0.4809, "step": 3132 }, { "epoch": 0.24392000389275462, "grad_norm": 2.00067010780316, "learning_rate": 3.506480696292361e-05, "loss": 0.4855, "step": 3133 }, { "epoch": 0.24399785898496423, "grad_norm": 1.9116982493283599, "learning_rate": 3.5061523156889454e-05, "loss": 0.5302, "step": 3134 }, { "epoch": 0.24407571407717385, "grad_norm": 2.0752597099311654, "learning_rate": 3.505823841259319e-05, "loss": 0.5273, "step": 3135 }, { "epoch": 0.2441535691693835, "grad_norm": 2.1448881470673817, "learning_rate": 3.505495273023947e-05, "loss": 0.5162, "step": 3136 }, { "epoch": 0.2442314242615931, "grad_norm": 1.9602403411118965, "learning_rate": 3.505166611003294e-05, "loss": 0.499, "step": 3137 }, { "epoch": 0.24430927935380273, "grad_norm": 2.14834849540293, "learning_rate": 3.5048378552178374e-05, "loss": 0.5316, "step": 3138 }, { "epoch": 0.24438713444601237, "grad_norm": 2.046738388009063, "learning_rate": 3.504509005688055e-05, "loss": 0.4794, "step": 3139 }, { "epoch": 0.244464989538222, "grad_norm": 2.180567688043021, "learning_rate": 3.504180062434434e-05, "loss": 0.5568, "step": 3140 }, { "epoch": 0.2445428446304316, "grad_norm": 2.019911851255761, "learning_rate": 3.503851025477466e-05, "loss": 0.4575, "step": 3141 }, { "epoch": 0.24462069972264122, "grad_norm": 2.00457909444739, "learning_rate": 3.5035218948376464e-05, "loss": 0.43, "step": 3142 }, { "epoch": 0.24469855481485087, "grad_norm": 2.065299110182051, "learning_rate": 3.5031926705354805e-05, "loss": 0.5418, "step": 3143 }, { "epoch": 0.24477640990706048, "grad_norm": 2.255212619932341, "learning_rate": 3.502863352591477e-05, "loss": 0.5344, "step": 3144 }, { "epoch": 0.2448542649992701, "grad_norm": 2.1415780567996587, "learning_rate": 3.5025339410261504e-05, "loss": 0.5106, "step": 3145 }, { "epoch": 0.24493212009147974, "grad_norm": 2.025651990051022, "learning_rate": 3.502204435860022e-05, "loss": 0.4862, "step": 3146 }, { "epoch": 0.24500997518368936, "grad_norm": 1.9967279217146638, "learning_rate": 3.5018748371136175e-05, "loss": 0.5011, "step": 3147 }, { "epoch": 0.24508783027589898, "grad_norm": 2.123485148157368, "learning_rate": 3.501545144807471e-05, "loss": 0.5162, "step": 3148 }, { "epoch": 0.24516568536810862, "grad_norm": 2.0111638932925477, "learning_rate": 3.50121535896212e-05, "loss": 0.4943, "step": 3149 }, { "epoch": 0.24524354046031824, "grad_norm": 1.9708105955426294, "learning_rate": 3.500885479598108e-05, "loss": 0.463, "step": 3150 }, { "epoch": 0.24524354046031824, "eval_loss": 0.0621529221534729, "eval_runtime": 162.5919, "eval_samples_per_second": 17.713, "eval_steps_per_second": 0.633, "step": 3150 }, { "epoch": 0.24532139555252785, "grad_norm": 2.023124965980449, "learning_rate": 3.5005555067359854e-05, "loss": 0.5114, "step": 3151 }, { "epoch": 0.24539925064473747, "grad_norm": 2.156757295062759, "learning_rate": 3.500225440396309e-05, "loss": 0.5299, "step": 3152 }, { "epoch": 0.24547710573694712, "grad_norm": 1.8599816393276118, "learning_rate": 3.499895280599637e-05, "loss": 0.4459, "step": 3153 }, { "epoch": 0.24555496082915673, "grad_norm": 2.013047648130111, "learning_rate": 3.499565027366541e-05, "loss": 0.4892, "step": 3154 }, { "epoch": 0.24563281592136635, "grad_norm": 1.985633856341916, "learning_rate": 3.499234680717592e-05, "loss": 0.5349, "step": 3155 }, { "epoch": 0.245710671013576, "grad_norm": 2.1471854783060893, "learning_rate": 3.4989042406733696e-05, "loss": 0.5198, "step": 3156 }, { "epoch": 0.2457885261057856, "grad_norm": 2.030615169771511, "learning_rate": 3.4985737072544585e-05, "loss": 0.5312, "step": 3157 }, { "epoch": 0.24586638119799523, "grad_norm": 1.8869437697737341, "learning_rate": 3.498243080481449e-05, "loss": 0.482, "step": 3158 }, { "epoch": 0.24594423629020484, "grad_norm": 2.0512784575299006, "learning_rate": 3.497912360374938e-05, "loss": 0.4701, "step": 3159 }, { "epoch": 0.2460220913824145, "grad_norm": 1.8344363953439387, "learning_rate": 3.4975815469555275e-05, "loss": 0.4411, "step": 3160 }, { "epoch": 0.2460999464746241, "grad_norm": 2.0914823859714478, "learning_rate": 3.497250640243826e-05, "loss": 0.5172, "step": 3161 }, { "epoch": 0.24617780156683372, "grad_norm": 2.141466031431025, "learning_rate": 3.4969196402604477e-05, "loss": 0.5026, "step": 3162 }, { "epoch": 0.24625565665904336, "grad_norm": 2.042246960851702, "learning_rate": 3.4965885470260125e-05, "loss": 0.5205, "step": 3163 }, { "epoch": 0.24633351175125298, "grad_norm": 2.078775264294762, "learning_rate": 3.4962573605611436e-05, "loss": 0.5135, "step": 3164 }, { "epoch": 0.2464113668434626, "grad_norm": 2.0504359497554305, "learning_rate": 3.495926080886476e-05, "loss": 0.53, "step": 3165 }, { "epoch": 0.24648922193567224, "grad_norm": 2.060012376161024, "learning_rate": 3.495594708022644e-05, "loss": 0.473, "step": 3166 }, { "epoch": 0.24656707702788186, "grad_norm": 2.1680854844335107, "learning_rate": 3.495263241990292e-05, "loss": 0.5157, "step": 3167 }, { "epoch": 0.24664493212009148, "grad_norm": 2.011028374611129, "learning_rate": 3.494931682810069e-05, "loss": 0.4751, "step": 3168 }, { "epoch": 0.2467227872123011, "grad_norm": 2.0321295682169547, "learning_rate": 3.4946000305026275e-05, "loss": 0.4993, "step": 3169 }, { "epoch": 0.24680064230451074, "grad_norm": 1.9426075244547467, "learning_rate": 3.49426828508863e-05, "loss": 0.4412, "step": 3170 }, { "epoch": 0.24687849739672035, "grad_norm": 1.971569264189066, "learning_rate": 3.493936446588742e-05, "loss": 0.5214, "step": 3171 }, { "epoch": 0.24695635248892997, "grad_norm": 1.9649610117444294, "learning_rate": 3.493604515023636e-05, "loss": 0.4783, "step": 3172 }, { "epoch": 0.24703420758113961, "grad_norm": 1.9456469695542118, "learning_rate": 3.4932724904139886e-05, "loss": 0.4431, "step": 3173 }, { "epoch": 0.24711206267334923, "grad_norm": 2.039931454075514, "learning_rate": 3.492940372780484e-05, "loss": 0.4986, "step": 3174 }, { "epoch": 0.24718991776555885, "grad_norm": 2.14132309582408, "learning_rate": 3.492608162143812e-05, "loss": 0.4872, "step": 3175 }, { "epoch": 0.2472677728577685, "grad_norm": 2.252232594148524, "learning_rate": 3.492275858524668e-05, "loss": 0.516, "step": 3176 }, { "epoch": 0.2473456279499781, "grad_norm": 2.020325238959167, "learning_rate": 3.491943461943751e-05, "loss": 0.5138, "step": 3177 }, { "epoch": 0.24742348304218773, "grad_norm": 2.2331826558034273, "learning_rate": 3.49161097242177e-05, "loss": 0.559, "step": 3178 }, { "epoch": 0.24750133813439734, "grad_norm": 2.0510390984785807, "learning_rate": 3.491278389979436e-05, "loss": 0.4892, "step": 3179 }, { "epoch": 0.247579193226607, "grad_norm": 2.0142915433133535, "learning_rate": 3.4909457146374684e-05, "loss": 0.4986, "step": 3180 }, { "epoch": 0.2476570483188166, "grad_norm": 1.9578841298657146, "learning_rate": 3.490612946416591e-05, "loss": 0.4989, "step": 3181 }, { "epoch": 0.24773490341102622, "grad_norm": 2.004512496789599, "learning_rate": 3.4902800853375334e-05, "loss": 0.4914, "step": 3182 }, { "epoch": 0.24781275850323586, "grad_norm": 2.2180285238161903, "learning_rate": 3.489947131421031e-05, "loss": 0.5171, "step": 3183 }, { "epoch": 0.24789061359544548, "grad_norm": 1.9325911904690949, "learning_rate": 3.4896140846878254e-05, "loss": 0.4705, "step": 3184 }, { "epoch": 0.2479684686876551, "grad_norm": 2.2541129706329652, "learning_rate": 3.489280945158665e-05, "loss": 0.4919, "step": 3185 }, { "epoch": 0.2480463237798647, "grad_norm": 2.0463173625815325, "learning_rate": 3.488947712854302e-05, "loss": 0.4742, "step": 3186 }, { "epoch": 0.24812417887207436, "grad_norm": 2.065250578194731, "learning_rate": 3.4886143877954944e-05, "loss": 0.5136, "step": 3187 }, { "epoch": 0.24820203396428397, "grad_norm": 2.0787414300729563, "learning_rate": 3.4882809700030076e-05, "loss": 0.5031, "step": 3188 }, { "epoch": 0.2482798890564936, "grad_norm": 1.9727957174674715, "learning_rate": 3.487947459497612e-05, "loss": 0.438, "step": 3189 }, { "epoch": 0.24835774414870324, "grad_norm": 1.8938763033095818, "learning_rate": 3.487613856300084e-05, "loss": 0.4651, "step": 3190 }, { "epoch": 0.24843559924091285, "grad_norm": 1.8827301404115748, "learning_rate": 3.487280160431206e-05, "loss": 0.4564, "step": 3191 }, { "epoch": 0.24851345433312247, "grad_norm": 1.8496417500154683, "learning_rate": 3.486946371911763e-05, "loss": 0.447, "step": 3192 }, { "epoch": 0.2485913094253321, "grad_norm": 2.118339502469233, "learning_rate": 3.486612490762551e-05, "loss": 0.4854, "step": 3193 }, { "epoch": 0.24866916451754173, "grad_norm": 2.030132492969817, "learning_rate": 3.486278517004369e-05, "loss": 0.4362, "step": 3194 }, { "epoch": 0.24874701960975135, "grad_norm": 2.114804697449367, "learning_rate": 3.485944450658021e-05, "loss": 0.4994, "step": 3195 }, { "epoch": 0.24882487470196096, "grad_norm": 2.1615571571147556, "learning_rate": 3.485610291744318e-05, "loss": 0.4803, "step": 3196 }, { "epoch": 0.2489027297941706, "grad_norm": 2.24166912708087, "learning_rate": 3.485276040284078e-05, "loss": 0.5462, "step": 3197 }, { "epoch": 0.24898058488638022, "grad_norm": 1.8916440901763691, "learning_rate": 3.484941696298121e-05, "loss": 0.4372, "step": 3198 }, { "epoch": 0.24905843997858984, "grad_norm": 1.8283251836655363, "learning_rate": 3.484607259807277e-05, "loss": 0.4094, "step": 3199 }, { "epoch": 0.24913629507079948, "grad_norm": 1.9046378580606307, "learning_rate": 3.4842727308323786e-05, "loss": 0.4398, "step": 3200 }, { "epoch": 0.24913629507079948, "eval_loss": 0.05973394587635994, "eval_runtime": 163.0056, "eval_samples_per_second": 17.668, "eval_steps_per_second": 0.632, "step": 3200 }, { "epoch": 0.2492141501630091, "grad_norm": 2.054547415300615, "learning_rate": 3.4839381093942654e-05, "loss": 0.5107, "step": 3201 }, { "epoch": 0.24929200525521872, "grad_norm": 2.0600869662324564, "learning_rate": 3.483603395513783e-05, "loss": 0.4503, "step": 3202 }, { "epoch": 0.24936986034742836, "grad_norm": 1.9161790250229067, "learning_rate": 3.4832685892117836e-05, "loss": 0.4701, "step": 3203 }, { "epoch": 0.24944771543963798, "grad_norm": 1.9403044338841833, "learning_rate": 3.482933690509122e-05, "loss": 0.4707, "step": 3204 }, { "epoch": 0.2495255705318476, "grad_norm": 2.1734699133591104, "learning_rate": 3.482598699426663e-05, "loss": 0.5514, "step": 3205 }, { "epoch": 0.2496034256240572, "grad_norm": 2.028756307477426, "learning_rate": 3.4822636159852735e-05, "loss": 0.4524, "step": 3206 }, { "epoch": 0.24968128071626686, "grad_norm": 2.108611184993435, "learning_rate": 3.481928440205827e-05, "loss": 0.4969, "step": 3207 }, { "epoch": 0.24975913580847647, "grad_norm": 1.9688428440524313, "learning_rate": 3.4815931721092055e-05, "loss": 0.4841, "step": 3208 }, { "epoch": 0.2498369909006861, "grad_norm": 2.1533555245003213, "learning_rate": 3.481257811716293e-05, "loss": 0.522, "step": 3209 }, { "epoch": 0.24991484599289573, "grad_norm": 1.8639280420370168, "learning_rate": 3.4809223590479815e-05, "loss": 0.4236, "step": 3210 }, { "epoch": 0.24999270108510535, "grad_norm": 1.8581609365890932, "learning_rate": 3.480586814125168e-05, "loss": 0.4154, "step": 3211 }, { "epoch": 0.250070556177315, "grad_norm": 1.9000028710569226, "learning_rate": 3.480251176968755e-05, "loss": 0.4538, "step": 3212 }, { "epoch": 0.2501484112695246, "grad_norm": 2.0256588490839125, "learning_rate": 3.479915447599652e-05, "loss": 0.4703, "step": 3213 }, { "epoch": 0.25022626636173423, "grad_norm": 1.9656712551528408, "learning_rate": 3.4795796260387725e-05, "loss": 0.4562, "step": 3214 }, { "epoch": 0.2503041214539439, "grad_norm": 2.0920538121171806, "learning_rate": 3.479243712307037e-05, "loss": 0.5179, "step": 3215 }, { "epoch": 0.25038197654615346, "grad_norm": 1.9816287065355007, "learning_rate": 3.478907706425371e-05, "loss": 0.4752, "step": 3216 }, { "epoch": 0.2504598316383631, "grad_norm": 1.9355406132877009, "learning_rate": 3.478571608414707e-05, "loss": 0.4639, "step": 3217 }, { "epoch": 0.2505376867305727, "grad_norm": 1.9751389030170596, "learning_rate": 3.478235418295981e-05, "loss": 0.464, "step": 3218 }, { "epoch": 0.25061554182278234, "grad_norm": 2.1359505380725854, "learning_rate": 3.477899136090137e-05, "loss": 0.5176, "step": 3219 }, { "epoch": 0.250693396914992, "grad_norm": 2.3105528610765185, "learning_rate": 3.477562761818123e-05, "loss": 0.5228, "step": 3220 }, { "epoch": 0.2507712520072016, "grad_norm": 2.0703438837755384, "learning_rate": 3.477226295500895e-05, "loss": 0.4906, "step": 3221 }, { "epoch": 0.2508491070994112, "grad_norm": 2.068998139046891, "learning_rate": 3.476889737159413e-05, "loss": 0.5206, "step": 3222 }, { "epoch": 0.25092696219162086, "grad_norm": 2.2142223955503626, "learning_rate": 3.476553086814641e-05, "loss": 0.5219, "step": 3223 }, { "epoch": 0.25100481728383045, "grad_norm": 1.9180557595174266, "learning_rate": 3.4762163444875524e-05, "loss": 0.417, "step": 3224 }, { "epoch": 0.2510826723760401, "grad_norm": 2.004689429581355, "learning_rate": 3.475879510199125e-05, "loss": 0.4579, "step": 3225 }, { "epoch": 0.25116052746824974, "grad_norm": 2.117780213893276, "learning_rate": 3.4755425839703406e-05, "loss": 0.4745, "step": 3226 }, { "epoch": 0.25123838256045933, "grad_norm": 2.109220720126347, "learning_rate": 3.4752055658221896e-05, "loss": 0.4604, "step": 3227 }, { "epoch": 0.251316237652669, "grad_norm": 1.82898478284936, "learning_rate": 3.474868455775665e-05, "loss": 0.4369, "step": 3228 }, { "epoch": 0.2513940927448786, "grad_norm": 1.9978999234998294, "learning_rate": 3.474531253851769e-05, "loss": 0.4611, "step": 3229 }, { "epoch": 0.2514719478370882, "grad_norm": 2.088202083141474, "learning_rate": 3.4741939600715055e-05, "loss": 0.4647, "step": 3230 }, { "epoch": 0.25154980292929785, "grad_norm": 2.1455315069637138, "learning_rate": 3.473856574455889e-05, "loss": 0.5043, "step": 3231 }, { "epoch": 0.2516276580215075, "grad_norm": 2.0722991454358914, "learning_rate": 3.473519097025935e-05, "loss": 0.506, "step": 3232 }, { "epoch": 0.2517055131137171, "grad_norm": 2.0762569209448225, "learning_rate": 3.473181527802668e-05, "loss": 0.5087, "step": 3233 }, { "epoch": 0.2517833682059267, "grad_norm": 2.0658479842272004, "learning_rate": 3.472843866807116e-05, "loss": 0.5067, "step": 3234 }, { "epoch": 0.2518612232981363, "grad_norm": 1.8615686959571012, "learning_rate": 3.4725061140603136e-05, "loss": 0.4194, "step": 3235 }, { "epoch": 0.25193907839034596, "grad_norm": 2.0929671468652735, "learning_rate": 3.472168269583303e-05, "loss": 0.47, "step": 3236 }, { "epoch": 0.2520169334825556, "grad_norm": 2.0519796570063558, "learning_rate": 3.471830333397128e-05, "loss": 0.4967, "step": 3237 }, { "epoch": 0.2520947885747652, "grad_norm": 1.9522706440505402, "learning_rate": 3.4714923055228417e-05, "loss": 0.4491, "step": 3238 }, { "epoch": 0.25217264366697484, "grad_norm": 1.9517008455570475, "learning_rate": 3.471154185981501e-05, "loss": 0.4657, "step": 3239 }, { "epoch": 0.2522504987591845, "grad_norm": 1.9801183253670505, "learning_rate": 3.4708159747941695e-05, "loss": 0.4792, "step": 3240 }, { "epoch": 0.25232835385139407, "grad_norm": 1.9898716536213836, "learning_rate": 3.4704776719819164e-05, "loss": 0.4991, "step": 3241 }, { "epoch": 0.2524062089436037, "grad_norm": 1.9252300417141606, "learning_rate": 3.470139277565817e-05, "loss": 0.4953, "step": 3242 }, { "epoch": 0.25248406403581336, "grad_norm": 1.9999207283691274, "learning_rate": 3.4698007915669495e-05, "loss": 0.4888, "step": 3243 }, { "epoch": 0.25256191912802295, "grad_norm": 2.0155502656692064, "learning_rate": 3.4694622140064025e-05, "loss": 0.4655, "step": 3244 }, { "epoch": 0.2526397742202326, "grad_norm": 1.982535632705253, "learning_rate": 3.469123544905266e-05, "loss": 0.4463, "step": 3245 }, { "epoch": 0.25271762931244224, "grad_norm": 1.9652503646802058, "learning_rate": 3.468784784284638e-05, "loss": 0.4645, "step": 3246 }, { "epoch": 0.2527954844046518, "grad_norm": 2.13920908551706, "learning_rate": 3.4684459321656226e-05, "loss": 0.5163, "step": 3247 }, { "epoch": 0.25287333949686147, "grad_norm": 1.9171252316372382, "learning_rate": 3.468106988569328e-05, "loss": 0.4321, "step": 3248 }, { "epoch": 0.2529511945890711, "grad_norm": 2.075598971121038, "learning_rate": 3.4677679535168675e-05, "loss": 0.462, "step": 3249 }, { "epoch": 0.2530290496812807, "grad_norm": 1.981902880794151, "learning_rate": 3.4674288270293636e-05, "loss": 0.4445, "step": 3250 }, { "epoch": 0.2530290496812807, "eval_loss": 0.05889367684721947, "eval_runtime": 163.2207, "eval_samples_per_second": 17.645, "eval_steps_per_second": 0.631, "step": 3250 }, { "epoch": 0.25310690477349035, "grad_norm": 2.0063870858513506, "learning_rate": 3.467089609127941e-05, "loss": 0.4787, "step": 3251 }, { "epoch": 0.2531847598657, "grad_norm": 1.950073987200417, "learning_rate": 3.4667502998337316e-05, "loss": 0.4333, "step": 3252 }, { "epoch": 0.2532626149579096, "grad_norm": 1.9278219909621135, "learning_rate": 3.466410899167873e-05, "loss": 0.4645, "step": 3253 }, { "epoch": 0.2533404700501192, "grad_norm": 1.93279558397697, "learning_rate": 3.466071407151508e-05, "loss": 0.4596, "step": 3254 }, { "epoch": 0.2534183251423288, "grad_norm": 2.1135615165475934, "learning_rate": 3.465731823805786e-05, "loss": 0.4963, "step": 3255 }, { "epoch": 0.25349618023453846, "grad_norm": 2.025868817104033, "learning_rate": 3.4653921491518604e-05, "loss": 0.4928, "step": 3256 }, { "epoch": 0.2535740353267481, "grad_norm": 1.9319306628934796, "learning_rate": 3.465052383210892e-05, "loss": 0.4999, "step": 3257 }, { "epoch": 0.2536518904189577, "grad_norm": 2.092441399497406, "learning_rate": 3.464712526004046e-05, "loss": 0.4885, "step": 3258 }, { "epoch": 0.25372974551116734, "grad_norm": 1.9343337509580247, "learning_rate": 3.464372577552495e-05, "loss": 0.4494, "step": 3259 }, { "epoch": 0.253807600603377, "grad_norm": 1.9320581417920504, "learning_rate": 3.4640325378774156e-05, "loss": 0.4608, "step": 3260 }, { "epoch": 0.25388545569558657, "grad_norm": 1.9375947313513786, "learning_rate": 3.463692406999991e-05, "loss": 0.4331, "step": 3261 }, { "epoch": 0.2539633107877962, "grad_norm": 1.8758101970897658, "learning_rate": 3.463352184941409e-05, "loss": 0.4332, "step": 3262 }, { "epoch": 0.25404116588000586, "grad_norm": 2.1355521275731233, "learning_rate": 3.463011871722864e-05, "loss": 0.5001, "step": 3263 }, { "epoch": 0.25411902097221545, "grad_norm": 2.048512945970756, "learning_rate": 3.462671467365556e-05, "loss": 0.4661, "step": 3264 }, { "epoch": 0.2541968760644251, "grad_norm": 1.926629467418869, "learning_rate": 3.4623309718906904e-05, "loss": 0.4569, "step": 3265 }, { "epoch": 0.25427473115663474, "grad_norm": 1.8559034821985734, "learning_rate": 3.46199038531948e-05, "loss": 0.4277, "step": 3266 }, { "epoch": 0.2543525862488443, "grad_norm": 1.9865376208371641, "learning_rate": 3.46164970767314e-05, "loss": 0.4743, "step": 3267 }, { "epoch": 0.25443044134105397, "grad_norm": 1.8825416421087837, "learning_rate": 3.461308938972893e-05, "loss": 0.4499, "step": 3268 }, { "epoch": 0.2545082964332636, "grad_norm": 1.939886243066616, "learning_rate": 3.460968079239969e-05, "loss": 0.4714, "step": 3269 }, { "epoch": 0.2545861515254732, "grad_norm": 1.990167386945708, "learning_rate": 3.4606271284956e-05, "loss": 0.4733, "step": 3270 }, { "epoch": 0.25466400661768285, "grad_norm": 2.0469640387248513, "learning_rate": 3.460286086761027e-05, "loss": 0.4913, "step": 3271 }, { "epoch": 0.25474186170989244, "grad_norm": 1.9131666808595746, "learning_rate": 3.459944954057494e-05, "loss": 0.4388, "step": 3272 }, { "epoch": 0.2548197168021021, "grad_norm": 2.194915137046519, "learning_rate": 3.4596037304062534e-05, "loss": 0.4595, "step": 3273 }, { "epoch": 0.2548975718943117, "grad_norm": 2.121744571793942, "learning_rate": 3.459262415828561e-05, "loss": 0.5223, "step": 3274 }, { "epoch": 0.2549754269865213, "grad_norm": 2.01731953483269, "learning_rate": 3.458921010345679e-05, "loss": 0.4806, "step": 3275 }, { "epoch": 0.25505328207873096, "grad_norm": 2.194625489455172, "learning_rate": 3.458579513978876e-05, "loss": 0.5014, "step": 3276 }, { "epoch": 0.2551311371709406, "grad_norm": 2.0848281626049587, "learning_rate": 3.4582379267494243e-05, "loss": 0.4787, "step": 3277 }, { "epoch": 0.2552089922631502, "grad_norm": 1.9815635606181674, "learning_rate": 3.457896248678605e-05, "loss": 0.4358, "step": 3278 }, { "epoch": 0.25528684735535984, "grad_norm": 2.061982031247591, "learning_rate": 3.457554479787702e-05, "loss": 0.5247, "step": 3279 }, { "epoch": 0.2553647024475695, "grad_norm": 2.121078852816025, "learning_rate": 3.457212620098005e-05, "loss": 0.4674, "step": 3280 }, { "epoch": 0.25544255753977907, "grad_norm": 1.7843544662381603, "learning_rate": 3.4568706696308126e-05, "loss": 0.4044, "step": 3281 }, { "epoch": 0.2555204126319887, "grad_norm": 1.811989978784589, "learning_rate": 3.456528628407425e-05, "loss": 0.4111, "step": 3282 }, { "epoch": 0.25559826772419836, "grad_norm": 1.9325088634854763, "learning_rate": 3.45618649644915e-05, "loss": 0.49, "step": 3283 }, { "epoch": 0.25567612281640795, "grad_norm": 2.0447503674064382, "learning_rate": 3.455844273777301e-05, "loss": 0.4451, "step": 3284 }, { "epoch": 0.2557539779086176, "grad_norm": 1.9137686892418604, "learning_rate": 3.4555019604131974e-05, "loss": 0.4138, "step": 3285 }, { "epoch": 0.25583183300082724, "grad_norm": 2.081384973352433, "learning_rate": 3.455159556378162e-05, "loss": 0.4539, "step": 3286 }, { "epoch": 0.2559096880930368, "grad_norm": 1.8342373151031852, "learning_rate": 3.454817061693526e-05, "loss": 0.4503, "step": 3287 }, { "epoch": 0.25598754318524647, "grad_norm": 2.224426689185693, "learning_rate": 3.454474476380626e-05, "loss": 0.5296, "step": 3288 }, { "epoch": 0.25606539827745606, "grad_norm": 2.09044358439606, "learning_rate": 3.454131800460803e-05, "loss": 0.45, "step": 3289 }, { "epoch": 0.2561432533696657, "grad_norm": 2.0403549790911653, "learning_rate": 3.4537890339554034e-05, "loss": 0.497, "step": 3290 }, { "epoch": 0.25622110846187535, "grad_norm": 1.9379714454574188, "learning_rate": 3.453446176885781e-05, "loss": 0.4217, "step": 3291 }, { "epoch": 0.25629896355408494, "grad_norm": 1.9190473278740285, "learning_rate": 3.453103229273293e-05, "loss": 0.4158, "step": 3292 }, { "epoch": 0.2563768186462946, "grad_norm": 1.965573479721424, "learning_rate": 3.4527601911393046e-05, "loss": 0.4707, "step": 3293 }, { "epoch": 0.2564546737385042, "grad_norm": 1.8908649774492063, "learning_rate": 3.452417062505184e-05, "loss": 0.4317, "step": 3294 }, { "epoch": 0.2565325288307138, "grad_norm": 2.1578273211370016, "learning_rate": 3.452073843392308e-05, "loss": 0.4644, "step": 3295 }, { "epoch": 0.25661038392292346, "grad_norm": 1.8774995458209893, "learning_rate": 3.4517305338220564e-05, "loss": 0.4527, "step": 3296 }, { "epoch": 0.2566882390151331, "grad_norm": 1.8946786016495096, "learning_rate": 3.451387133815817e-05, "loss": 0.4837, "step": 3297 }, { "epoch": 0.2567660941073427, "grad_norm": 2.0332336260914956, "learning_rate": 3.4510436433949805e-05, "loss": 0.4359, "step": 3298 }, { "epoch": 0.25684394919955233, "grad_norm": 1.8966354730615278, "learning_rate": 3.4507000625809456e-05, "loss": 0.4523, "step": 3299 }, { "epoch": 0.256921804291762, "grad_norm": 1.8073292126421403, "learning_rate": 3.450356391395116e-05, "loss": 0.3766, "step": 3300 }, { "epoch": 0.256921804291762, "eval_loss": 0.05744847282767296, "eval_runtime": 228.8767, "eval_samples_per_second": 12.583, "eval_steps_per_second": 0.45, "step": 3300 }, { "epoch": 0.25699965938397157, "grad_norm": 1.9191061352787695, "learning_rate": 3.4500126298589004e-05, "loss": 0.4774, "step": 3301 }, { "epoch": 0.2570775144761812, "grad_norm": 1.8235335546576426, "learning_rate": 3.449668777993714e-05, "loss": 0.4134, "step": 3302 }, { "epoch": 0.25715536956839086, "grad_norm": 2.025196805452496, "learning_rate": 3.449324835820976e-05, "loss": 0.4589, "step": 3303 }, { "epoch": 0.25723322466060045, "grad_norm": 1.98801009520012, "learning_rate": 3.4489808033621135e-05, "loss": 0.4474, "step": 3304 }, { "epoch": 0.2573110797528101, "grad_norm": 2.0132020060607387, "learning_rate": 3.448636680638558e-05, "loss": 0.4543, "step": 3305 }, { "epoch": 0.25738893484501973, "grad_norm": 2.02242465980134, "learning_rate": 3.4482924676717465e-05, "loss": 0.4945, "step": 3306 }, { "epoch": 0.2574667899372293, "grad_norm": 1.9769955469284437, "learning_rate": 3.4479481644831216e-05, "loss": 0.4424, "step": 3307 }, { "epoch": 0.25754464502943897, "grad_norm": 1.9841413256465172, "learning_rate": 3.4476037710941324e-05, "loss": 0.4635, "step": 3308 }, { "epoch": 0.25762250012164856, "grad_norm": 1.9574032725128372, "learning_rate": 3.447259287526232e-05, "loss": 0.4061, "step": 3309 }, { "epoch": 0.2577003552138582, "grad_norm": 1.985079990421644, "learning_rate": 3.446914713800881e-05, "loss": 0.4596, "step": 3310 }, { "epoch": 0.25777821030606785, "grad_norm": 2.1348766050099908, "learning_rate": 3.4465700499395445e-05, "loss": 0.4455, "step": 3311 }, { "epoch": 0.25785606539827743, "grad_norm": 1.937164576101299, "learning_rate": 3.446225295963694e-05, "loss": 0.4243, "step": 3312 }, { "epoch": 0.2579339204904871, "grad_norm": 1.913212468519538, "learning_rate": 3.4458804518948044e-05, "loss": 0.4629, "step": 3313 }, { "epoch": 0.2580117755826967, "grad_norm": 2.2461900429082435, "learning_rate": 3.445535517754359e-05, "loss": 0.457, "step": 3314 }, { "epoch": 0.2580896306749063, "grad_norm": 1.9398163566257869, "learning_rate": 3.4451904935638454e-05, "loss": 0.4118, "step": 3315 }, { "epoch": 0.25816748576711596, "grad_norm": 2.0404989727980807, "learning_rate": 3.4448453793447575e-05, "loss": 0.449, "step": 3316 }, { "epoch": 0.2582453408593256, "grad_norm": 2.0046862093974713, "learning_rate": 3.444500175118594e-05, "loss": 0.4422, "step": 3317 }, { "epoch": 0.2583231959515352, "grad_norm": 1.864961922499449, "learning_rate": 3.444154880906859e-05, "loss": 0.3954, "step": 3318 }, { "epoch": 0.25840105104374483, "grad_norm": 1.8292439374393934, "learning_rate": 3.4438094967310625e-05, "loss": 0.4184, "step": 3319 }, { "epoch": 0.2584789061359545, "grad_norm": 2.0709017145892474, "learning_rate": 3.4434640226127216e-05, "loss": 0.4261, "step": 3320 }, { "epoch": 0.25855676122816407, "grad_norm": 2.370678432007795, "learning_rate": 3.443118458573357e-05, "loss": 0.5807, "step": 3321 }, { "epoch": 0.2586346163203737, "grad_norm": 1.7131311118613874, "learning_rate": 3.442772804634495e-05, "loss": 0.415, "step": 3322 }, { "epoch": 0.25871247141258336, "grad_norm": 1.8610309183238567, "learning_rate": 3.4424270608176696e-05, "loss": 0.4104, "step": 3323 }, { "epoch": 0.25879032650479294, "grad_norm": 1.8304363224543294, "learning_rate": 3.442081227144418e-05, "loss": 0.4173, "step": 3324 }, { "epoch": 0.2588681815970026, "grad_norm": 1.9807751093158446, "learning_rate": 3.4417353036362845e-05, "loss": 0.4775, "step": 3325 }, { "epoch": 0.2589460366892122, "grad_norm": 1.8267285984191666, "learning_rate": 3.441389290314818e-05, "loss": 0.4892, "step": 3326 }, { "epoch": 0.2590238917814218, "grad_norm": 1.9733552948797122, "learning_rate": 3.441043187201574e-05, "loss": 0.4201, "step": 3327 }, { "epoch": 0.25910174687363147, "grad_norm": 1.8652154693040623, "learning_rate": 3.4406969943181134e-05, "loss": 0.4389, "step": 3328 }, { "epoch": 0.25917960196584106, "grad_norm": 1.9012518613542928, "learning_rate": 3.4403507116860016e-05, "loss": 0.4493, "step": 3329 }, { "epoch": 0.2592574570580507, "grad_norm": 2.1073573843997355, "learning_rate": 3.440004339326811e-05, "loss": 0.5039, "step": 3330 }, { "epoch": 0.25933531215026034, "grad_norm": 2.1156993436573464, "learning_rate": 3.439657877262118e-05, "loss": 0.5035, "step": 3331 }, { "epoch": 0.25941316724246993, "grad_norm": 1.956345154444442, "learning_rate": 3.439311325513506e-05, "loss": 0.4334, "step": 3332 }, { "epoch": 0.2594910223346796, "grad_norm": 1.8662147602463484, "learning_rate": 3.438964684102565e-05, "loss": 0.4401, "step": 3333 }, { "epoch": 0.2595688774268892, "grad_norm": 1.8911079876227306, "learning_rate": 3.438617953050887e-05, "loss": 0.5067, "step": 3334 }, { "epoch": 0.2596467325190988, "grad_norm": 1.844638176591285, "learning_rate": 3.438271132380073e-05, "loss": 0.4148, "step": 3335 }, { "epoch": 0.25972458761130846, "grad_norm": 2.036186944360366, "learning_rate": 3.437924222111728e-05, "loss": 0.4814, "step": 3336 }, { "epoch": 0.2598024427035181, "grad_norm": 2.0451402538094743, "learning_rate": 3.437577222267463e-05, "loss": 0.4709, "step": 3337 }, { "epoch": 0.2598802977957277, "grad_norm": 1.985864696698138, "learning_rate": 3.437230132868893e-05, "loss": 0.4631, "step": 3338 }, { "epoch": 0.25995815288793733, "grad_norm": 1.9891717239933349, "learning_rate": 3.4368829539376427e-05, "loss": 0.4399, "step": 3339 }, { "epoch": 0.260036007980147, "grad_norm": 2.2196262322857403, "learning_rate": 3.436535685495337e-05, "loss": 0.4435, "step": 3340 }, { "epoch": 0.26011386307235657, "grad_norm": 1.9569132083850609, "learning_rate": 3.4361883275636116e-05, "loss": 0.4175, "step": 3341 }, { "epoch": 0.2601917181645662, "grad_norm": 1.7595167779735668, "learning_rate": 3.4358408801641036e-05, "loss": 0.4151, "step": 3342 }, { "epoch": 0.2602695732567758, "grad_norm": 1.9940337801254573, "learning_rate": 3.4354933433184585e-05, "loss": 0.4686, "step": 3343 }, { "epoch": 0.26034742834898544, "grad_norm": 1.7666157819820312, "learning_rate": 3.435145717048324e-05, "loss": 0.3688, "step": 3344 }, { "epoch": 0.2604252834411951, "grad_norm": 1.746916454081431, "learning_rate": 3.434798001375359e-05, "loss": 0.3954, "step": 3345 }, { "epoch": 0.2605031385334047, "grad_norm": 1.9599079864329796, "learning_rate": 3.4344501963212216e-05, "loss": 0.4156, "step": 3346 }, { "epoch": 0.2605809936256143, "grad_norm": 1.8825396589193912, "learning_rate": 3.43410230190758e-05, "loss": 0.4513, "step": 3347 }, { "epoch": 0.26065884871782397, "grad_norm": 2.1192862918793187, "learning_rate": 3.433754318156105e-05, "loss": 0.4746, "step": 3348 }, { "epoch": 0.26073670381003355, "grad_norm": 2.0937966820573313, "learning_rate": 3.4334062450884755e-05, "loss": 0.4601, "step": 3349 }, { "epoch": 0.2608145589022432, "grad_norm": 1.8065558966470472, "learning_rate": 3.4330580827263745e-05, "loss": 0.4026, "step": 3350 }, { "epoch": 0.2608145589022432, "eval_loss": 0.05595027655363083, "eval_runtime": 162.2213, "eval_samples_per_second": 17.754, "eval_steps_per_second": 0.635, "step": 3350 }, { "epoch": 0.26089241399445284, "grad_norm": 1.900557666608592, "learning_rate": 3.432709831091491e-05, "loss": 0.4003, "step": 3351 }, { "epoch": 0.26097026908666243, "grad_norm": 2.033271472940577, "learning_rate": 3.4323614902055194e-05, "loss": 0.4643, "step": 3352 }, { "epoch": 0.2610481241788721, "grad_norm": 1.832691097728757, "learning_rate": 3.4320130600901604e-05, "loss": 0.3578, "step": 3353 }, { "epoch": 0.2611259792710817, "grad_norm": 2.0583842433867536, "learning_rate": 3.431664540767118e-05, "loss": 0.5049, "step": 3354 }, { "epoch": 0.2612038343632913, "grad_norm": 1.931165719733855, "learning_rate": 3.431315932258104e-05, "loss": 0.4337, "step": 3355 }, { "epoch": 0.26128168945550095, "grad_norm": 1.9349516914827167, "learning_rate": 3.4309672345848355e-05, "loss": 0.4392, "step": 3356 }, { "epoch": 0.2613595445477106, "grad_norm": 1.8984911351253986, "learning_rate": 3.4306184477690346e-05, "loss": 0.442, "step": 3357 }, { "epoch": 0.2614373996399202, "grad_norm": 1.8787450662959968, "learning_rate": 3.430269571832429e-05, "loss": 0.424, "step": 3358 }, { "epoch": 0.26151525473212983, "grad_norm": 1.9751304213165333, "learning_rate": 3.429920606796751e-05, "loss": 0.4368, "step": 3359 }, { "epoch": 0.2615931098243395, "grad_norm": 1.974881578204203, "learning_rate": 3.429571552683741e-05, "loss": 0.4401, "step": 3360 }, { "epoch": 0.26167096491654906, "grad_norm": 1.9937870638599653, "learning_rate": 3.4292224095151435e-05, "loss": 0.4359, "step": 3361 }, { "epoch": 0.2617488200087587, "grad_norm": 2.0443916711148673, "learning_rate": 3.428873177312707e-05, "loss": 0.4846, "step": 3362 }, { "epoch": 0.2618266751009683, "grad_norm": 1.829817270379315, "learning_rate": 3.4285238560981876e-05, "loss": 0.4436, "step": 3363 }, { "epoch": 0.26190453019317794, "grad_norm": 2.14954702561937, "learning_rate": 3.428174445893347e-05, "loss": 0.4933, "step": 3364 }, { "epoch": 0.2619823852853876, "grad_norm": 2.0509692099110897, "learning_rate": 3.4278249467199515e-05, "loss": 0.4736, "step": 3365 }, { "epoch": 0.2620602403775972, "grad_norm": 1.9137376788186857, "learning_rate": 3.427475358599773e-05, "loss": 0.4632, "step": 3366 }, { "epoch": 0.2621380954698068, "grad_norm": 2.021031483567976, "learning_rate": 3.42712568155459e-05, "loss": 0.4381, "step": 3367 }, { "epoch": 0.26221595056201646, "grad_norm": 2.2185286455103705, "learning_rate": 3.426775915606185e-05, "loss": 0.4996, "step": 3368 }, { "epoch": 0.26229380565422605, "grad_norm": 1.855102977499611, "learning_rate": 3.426426060776346e-05, "loss": 0.3967, "step": 3369 }, { "epoch": 0.2623716607464357, "grad_norm": 1.8572459875170086, "learning_rate": 3.426076117086869e-05, "loss": 0.4105, "step": 3370 }, { "epoch": 0.26244951583864534, "grad_norm": 2.071659757335504, "learning_rate": 3.425726084559554e-05, "loss": 0.4135, "step": 3371 }, { "epoch": 0.26252737093085493, "grad_norm": 2.135561194958634, "learning_rate": 3.4253759632162036e-05, "loss": 0.4598, "step": 3372 }, { "epoch": 0.2626052260230646, "grad_norm": 2.0716623552648614, "learning_rate": 3.4250257530786314e-05, "loss": 0.4564, "step": 3373 }, { "epoch": 0.2626830811152742, "grad_norm": 1.9690116888618414, "learning_rate": 3.424675454168653e-05, "loss": 0.3899, "step": 3374 }, { "epoch": 0.2627609362074838, "grad_norm": 1.9872598329196987, "learning_rate": 3.424325066508089e-05, "loss": 0.418, "step": 3375 }, { "epoch": 0.26283879129969345, "grad_norm": 2.1506533437950597, "learning_rate": 3.423974590118771e-05, "loss": 0.4772, "step": 3376 }, { "epoch": 0.2629166463919031, "grad_norm": 2.174386145397369, "learning_rate": 3.4236240250225275e-05, "loss": 0.5151, "step": 3377 }, { "epoch": 0.2629945014841127, "grad_norm": 2.0292436601165096, "learning_rate": 3.423273371241199e-05, "loss": 0.4393, "step": 3378 }, { "epoch": 0.26307235657632233, "grad_norm": 2.164283844507528, "learning_rate": 3.42292262879663e-05, "loss": 0.4528, "step": 3379 }, { "epoch": 0.2631502116685319, "grad_norm": 2.1623457345091057, "learning_rate": 3.422571797710669e-05, "loss": 0.4767, "step": 3380 }, { "epoch": 0.26322806676074156, "grad_norm": 1.936629340653123, "learning_rate": 3.422220878005171e-05, "loss": 0.4633, "step": 3381 }, { "epoch": 0.2633059218529512, "grad_norm": 1.9801304367395052, "learning_rate": 3.421869869701999e-05, "loss": 0.4716, "step": 3382 }, { "epoch": 0.2633837769451608, "grad_norm": 1.8211318682013213, "learning_rate": 3.421518772823016e-05, "loss": 0.3611, "step": 3383 }, { "epoch": 0.26346163203737044, "grad_norm": 1.9822090955587621, "learning_rate": 3.421167587390096e-05, "loss": 0.4548, "step": 3384 }, { "epoch": 0.2635394871295801, "grad_norm": 1.9273492731212087, "learning_rate": 3.420816313425116e-05, "loss": 0.4627, "step": 3385 }, { "epoch": 0.2636173422217897, "grad_norm": 2.033149920468028, "learning_rate": 3.420464950949957e-05, "loss": 0.431, "step": 3386 }, { "epoch": 0.2636951973139993, "grad_norm": 2.009066434302082, "learning_rate": 3.420113499986508e-05, "loss": 0.4601, "step": 3387 }, { "epoch": 0.26377305240620896, "grad_norm": 1.7824224361988275, "learning_rate": 3.419761960556664e-05, "loss": 0.371, "step": 3388 }, { "epoch": 0.26385090749841855, "grad_norm": 1.880307241804392, "learning_rate": 3.419410332682323e-05, "loss": 0.4317, "step": 3389 }, { "epoch": 0.2639287625906282, "grad_norm": 1.9289642885334204, "learning_rate": 3.419058616385391e-05, "loss": 0.4773, "step": 3390 }, { "epoch": 0.26400661768283784, "grad_norm": 1.8886147970072575, "learning_rate": 3.418706811687776e-05, "loss": 0.4392, "step": 3391 }, { "epoch": 0.26408447277504743, "grad_norm": 1.9318068963135675, "learning_rate": 3.418354918611396e-05, "loss": 0.427, "step": 3392 }, { "epoch": 0.2641623278672571, "grad_norm": 1.9607136376008079, "learning_rate": 3.418002937178171e-05, "loss": 0.4678, "step": 3393 }, { "epoch": 0.2642401829594667, "grad_norm": 1.8343508191075222, "learning_rate": 3.4176508674100284e-05, "loss": 0.4117, "step": 3394 }, { "epoch": 0.2643180380516763, "grad_norm": 1.7503313450184592, "learning_rate": 3.4172987093288994e-05, "loss": 0.3951, "step": 3395 }, { "epoch": 0.26439589314388595, "grad_norm": 2.0886330459543907, "learning_rate": 3.416946462956724e-05, "loss": 0.4389, "step": 3396 }, { "epoch": 0.26447374823609554, "grad_norm": 1.8896427605642907, "learning_rate": 3.416594128315444e-05, "loss": 0.444, "step": 3397 }, { "epoch": 0.2645516033283052, "grad_norm": 1.890265268926219, "learning_rate": 3.416241705427008e-05, "loss": 0.4208, "step": 3398 }, { "epoch": 0.26462945842051483, "grad_norm": 1.804768601070736, "learning_rate": 3.415889194313371e-05, "loss": 0.3584, "step": 3399 }, { "epoch": 0.2647073135127244, "grad_norm": 1.994905044960065, "learning_rate": 3.415536594996492e-05, "loss": 0.4553, "step": 3400 }, { "epoch": 0.2647073135127244, "eval_loss": 0.05510272458195686, "eval_runtime": 167.5935, "eval_samples_per_second": 17.184, "eval_steps_per_second": 0.615, "step": 3400 }, { "epoch": 0.26478516860493406, "grad_norm": 2.011707407681694, "learning_rate": 3.415183907498337e-05, "loss": 0.4441, "step": 3401 }, { "epoch": 0.2648630236971437, "grad_norm": 1.9379769615167868, "learning_rate": 3.414831131840877e-05, "loss": 0.4048, "step": 3402 }, { "epoch": 0.2649408787893533, "grad_norm": 2.066448836576142, "learning_rate": 3.414478268046087e-05, "loss": 0.4652, "step": 3403 }, { "epoch": 0.26501873388156294, "grad_norm": 1.8884135178317656, "learning_rate": 3.4141253161359495e-05, "loss": 0.3915, "step": 3404 }, { "epoch": 0.2650965889737726, "grad_norm": 1.9465037152291402, "learning_rate": 3.413772276132452e-05, "loss": 0.4357, "step": 3405 }, { "epoch": 0.2651744440659822, "grad_norm": 1.8923052487702148, "learning_rate": 3.413419148057587e-05, "loss": 0.4205, "step": 3406 }, { "epoch": 0.2652522991581918, "grad_norm": 1.9725746847654162, "learning_rate": 3.413065931933353e-05, "loss": 0.4211, "step": 3407 }, { "epoch": 0.26533015425040146, "grad_norm": 2.0357508604196664, "learning_rate": 3.412712627781753e-05, "loss": 0.4477, "step": 3408 }, { "epoch": 0.26540800934261105, "grad_norm": 1.8780596400788032, "learning_rate": 3.4123592356247974e-05, "loss": 0.4384, "step": 3409 }, { "epoch": 0.2654858644348207, "grad_norm": 1.9739466304191466, "learning_rate": 3.4120057554844996e-05, "loss": 0.4746, "step": 3410 }, { "epoch": 0.26556371952703034, "grad_norm": 1.9785957140843649, "learning_rate": 3.41165218738288e-05, "loss": 0.454, "step": 3411 }, { "epoch": 0.26564157461923993, "grad_norm": 1.9732404157959333, "learning_rate": 3.411298531341965e-05, "loss": 0.4385, "step": 3412 }, { "epoch": 0.2657194297114496, "grad_norm": 1.8914589121548013, "learning_rate": 3.410944787383785e-05, "loss": 0.4182, "step": 3413 }, { "epoch": 0.26579728480365916, "grad_norm": 1.9663303509096743, "learning_rate": 3.410590955530376e-05, "loss": 0.4314, "step": 3414 }, { "epoch": 0.2658751398958688, "grad_norm": 1.9763172978573404, "learning_rate": 3.410237035803782e-05, "loss": 0.4109, "step": 3415 }, { "epoch": 0.26595299498807845, "grad_norm": 1.8544676201510575, "learning_rate": 3.409883028226049e-05, "loss": 0.3771, "step": 3416 }, { "epoch": 0.26603085008028804, "grad_norm": 2.072374314644776, "learning_rate": 3.40952893281923e-05, "loss": 0.4213, "step": 3417 }, { "epoch": 0.2661087051724977, "grad_norm": 1.8979489566621042, "learning_rate": 3.409174749605385e-05, "loss": 0.4195, "step": 3418 }, { "epoch": 0.26618656026470733, "grad_norm": 2.0253831939593305, "learning_rate": 3.408820478606575e-05, "loss": 0.4228, "step": 3419 }, { "epoch": 0.2662644153569169, "grad_norm": 1.8532407739200698, "learning_rate": 3.408466119844873e-05, "loss": 0.3682, "step": 3420 }, { "epoch": 0.26634227044912656, "grad_norm": 1.8735643018710466, "learning_rate": 3.4081116733423506e-05, "loss": 0.3979, "step": 3421 }, { "epoch": 0.2664201255413362, "grad_norm": 2.1276635783303512, "learning_rate": 3.407757139121091e-05, "loss": 0.4814, "step": 3422 }, { "epoch": 0.2664979806335458, "grad_norm": 1.9177030271965534, "learning_rate": 3.4074025172031776e-05, "loss": 0.437, "step": 3423 }, { "epoch": 0.26657583572575544, "grad_norm": 1.941355010321519, "learning_rate": 3.407047807610703e-05, "loss": 0.4467, "step": 3424 }, { "epoch": 0.2666536908179651, "grad_norm": 1.9615697420045348, "learning_rate": 3.406693010365764e-05, "loss": 0.4609, "step": 3425 }, { "epoch": 0.26673154591017467, "grad_norm": 1.9049314536091204, "learning_rate": 3.4063381254904616e-05, "loss": 0.4337, "step": 3426 }, { "epoch": 0.2668094010023843, "grad_norm": 1.9165670092262221, "learning_rate": 3.405983153006906e-05, "loss": 0.4099, "step": 3427 }, { "epoch": 0.26688725609459396, "grad_norm": 1.9875672462443723, "learning_rate": 3.405628092937208e-05, "loss": 0.445, "step": 3428 }, { "epoch": 0.26696511118680355, "grad_norm": 1.9693949371677268, "learning_rate": 3.405272945303486e-05, "loss": 0.4304, "step": 3429 }, { "epoch": 0.2670429662790132, "grad_norm": 1.911539973751559, "learning_rate": 3.4049177101278655e-05, "loss": 0.4057, "step": 3430 }, { "epoch": 0.26712082137122284, "grad_norm": 1.87628310340056, "learning_rate": 3.404562387432475e-05, "loss": 0.4071, "step": 3431 }, { "epoch": 0.2671986764634324, "grad_norm": 1.7181375797587692, "learning_rate": 3.404206977239451e-05, "loss": 0.359, "step": 3432 }, { "epoch": 0.26727653155564207, "grad_norm": 1.9345191060060685, "learning_rate": 3.403851479570931e-05, "loss": 0.4267, "step": 3433 }, { "epoch": 0.26735438664785166, "grad_norm": 1.8639375236796487, "learning_rate": 3.403495894449064e-05, "loss": 0.427, "step": 3434 }, { "epoch": 0.2674322417400613, "grad_norm": 1.8888496374518076, "learning_rate": 3.403140221895999e-05, "loss": 0.4262, "step": 3435 }, { "epoch": 0.26751009683227095, "grad_norm": 1.964727625541197, "learning_rate": 3.402784461933894e-05, "loss": 0.458, "step": 3436 }, { "epoch": 0.26758795192448054, "grad_norm": 1.8097592516169014, "learning_rate": 3.40242861458491e-05, "loss": 0.3925, "step": 3437 }, { "epoch": 0.2676658070166902, "grad_norm": 1.8748937716529053, "learning_rate": 3.402072679871215e-05, "loss": 0.3917, "step": 3438 }, { "epoch": 0.2677436621088998, "grad_norm": 1.8368041055931839, "learning_rate": 3.4017166578149837e-05, "loss": 0.4259, "step": 3439 }, { "epoch": 0.2678215172011094, "grad_norm": 2.105797963697063, "learning_rate": 3.4013605484383925e-05, "loss": 0.416, "step": 3440 }, { "epoch": 0.26789937229331906, "grad_norm": 1.8571769070686048, "learning_rate": 3.401004351763626e-05, "loss": 0.4244, "step": 3441 }, { "epoch": 0.2679772273855287, "grad_norm": 2.014862819111163, "learning_rate": 3.4006480678128735e-05, "loss": 0.4775, "step": 3442 }, { "epoch": 0.2680550824777383, "grad_norm": 1.9797903855160042, "learning_rate": 3.400291696608331e-05, "loss": 0.4299, "step": 3443 }, { "epoch": 0.26813293756994794, "grad_norm": 1.9098821084164752, "learning_rate": 3.3999352381721964e-05, "loss": 0.424, "step": 3444 }, { "epoch": 0.2682107926621576, "grad_norm": 1.9923119184240394, "learning_rate": 3.399578692526678e-05, "loss": 0.4299, "step": 3445 }, { "epoch": 0.26828864775436717, "grad_norm": 1.7671761887827204, "learning_rate": 3.3992220596939854e-05, "loss": 0.3858, "step": 3446 }, { "epoch": 0.2683665028465768, "grad_norm": 2.035637071195091, "learning_rate": 3.398865339696334e-05, "loss": 0.4505, "step": 3447 }, { "epoch": 0.26844435793878646, "grad_norm": 2.0275996383107087, "learning_rate": 3.3985085325559485e-05, "loss": 0.4567, "step": 3448 }, { "epoch": 0.26852221303099605, "grad_norm": 1.7270451817965977, "learning_rate": 3.3981516382950545e-05, "loss": 0.4005, "step": 3449 }, { "epoch": 0.2686000681232057, "grad_norm": 1.8328607972829742, "learning_rate": 3.397794656935886e-05, "loss": 0.4216, "step": 3450 }, { "epoch": 0.2686000681232057, "eval_loss": 0.05395922437310219, "eval_runtime": 167.5526, "eval_samples_per_second": 17.189, "eval_steps_per_second": 0.615, "step": 3450 }, { "epoch": 0.2686779232154153, "grad_norm": 2.1457346039153977, "learning_rate": 3.3974375885006806e-05, "loss": 0.4691, "step": 3451 }, { "epoch": 0.2687557783076249, "grad_norm": 1.7787792666311635, "learning_rate": 3.3970804330116815e-05, "loss": 0.4073, "step": 3452 }, { "epoch": 0.26883363339983457, "grad_norm": 2.029442159428784, "learning_rate": 3.396723190491139e-05, "loss": 0.4157, "step": 3453 }, { "epoch": 0.26891148849204416, "grad_norm": 1.941208375612921, "learning_rate": 3.3963658609613054e-05, "loss": 0.4429, "step": 3454 }, { "epoch": 0.2689893435842538, "grad_norm": 1.8056540788833064, "learning_rate": 3.396008444444443e-05, "loss": 0.4123, "step": 3455 }, { "epoch": 0.26906719867646345, "grad_norm": 1.9028058778581693, "learning_rate": 3.3956509409628166e-05, "loss": 0.403, "step": 3456 }, { "epoch": 0.26914505376867304, "grad_norm": 1.807249802166334, "learning_rate": 3.395293350538696e-05, "loss": 0.4106, "step": 3457 }, { "epoch": 0.2692229088608827, "grad_norm": 1.803834407178205, "learning_rate": 3.3949356731943596e-05, "loss": 0.3807, "step": 3458 }, { "epoch": 0.2693007639530923, "grad_norm": 2.0120748956055063, "learning_rate": 3.394577908952086e-05, "loss": 0.4884, "step": 3459 }, { "epoch": 0.2693786190453019, "grad_norm": 1.8502787805031327, "learning_rate": 3.394220057834164e-05, "loss": 0.4382, "step": 3460 }, { "epoch": 0.26945647413751156, "grad_norm": 1.8219278780111765, "learning_rate": 3.3938621198628865e-05, "loss": 0.435, "step": 3461 }, { "epoch": 0.2695343292297212, "grad_norm": 1.8678302775438473, "learning_rate": 3.39350409506055e-05, "loss": 0.3995, "step": 3462 }, { "epoch": 0.2696121843219308, "grad_norm": 1.9088237130501877, "learning_rate": 3.393145983449459e-05, "loss": 0.4678, "step": 3463 }, { "epoch": 0.26969003941414044, "grad_norm": 1.8852253702683024, "learning_rate": 3.3927877850519214e-05, "loss": 0.4103, "step": 3464 }, { "epoch": 0.2697678945063501, "grad_norm": 1.839457415812944, "learning_rate": 3.392429499890251e-05, "loss": 0.4146, "step": 3465 }, { "epoch": 0.26984574959855967, "grad_norm": 1.8383636683959406, "learning_rate": 3.392071127986769e-05, "loss": 0.4019, "step": 3466 }, { "epoch": 0.2699236046907693, "grad_norm": 2.0863762886689843, "learning_rate": 3.3917126693637976e-05, "loss": 0.4264, "step": 3467 }, { "epoch": 0.2700014597829789, "grad_norm": 2.051312575766241, "learning_rate": 3.3913541240436684e-05, "loss": 0.4543, "step": 3468 }, { "epoch": 0.27007931487518855, "grad_norm": 2.0824582268721126, "learning_rate": 3.390995492048719e-05, "loss": 0.4251, "step": 3469 }, { "epoch": 0.2701571699673982, "grad_norm": 2.031857273611598, "learning_rate": 3.390636773401287e-05, "loss": 0.4282, "step": 3470 }, { "epoch": 0.2702350250596078, "grad_norm": 1.9737254362650347, "learning_rate": 3.390277968123721e-05, "loss": 0.4102, "step": 3471 }, { "epoch": 0.2703128801518174, "grad_norm": 2.015180880182523, "learning_rate": 3.389919076238373e-05, "loss": 0.4469, "step": 3472 }, { "epoch": 0.27039073524402707, "grad_norm": 1.8512103852336181, "learning_rate": 3.389560097767599e-05, "loss": 0.3669, "step": 3473 }, { "epoch": 0.27046859033623666, "grad_norm": 1.8655110444225316, "learning_rate": 3.389201032733763e-05, "loss": 0.4308, "step": 3474 }, { "epoch": 0.2705464454284463, "grad_norm": 1.8638644844852952, "learning_rate": 3.3888418811592325e-05, "loss": 0.4143, "step": 3475 }, { "epoch": 0.27062430052065595, "grad_norm": 1.8802576145708398, "learning_rate": 3.388482643066381e-05, "loss": 0.3827, "step": 3476 }, { "epoch": 0.27070215561286554, "grad_norm": 2.1129540666156754, "learning_rate": 3.388123318477587e-05, "loss": 0.4771, "step": 3477 }, { "epoch": 0.2707800107050752, "grad_norm": 1.8580703137862913, "learning_rate": 3.387763907415234e-05, "loss": 0.4392, "step": 3478 }, { "epoch": 0.2708578657972848, "grad_norm": 2.042394349134297, "learning_rate": 3.387404409901715e-05, "loss": 0.4542, "step": 3479 }, { "epoch": 0.2709357208894944, "grad_norm": 1.9957194445787656, "learning_rate": 3.3870448259594204e-05, "loss": 0.4075, "step": 3480 }, { "epoch": 0.27101357598170406, "grad_norm": 1.7793784614515356, "learning_rate": 3.386685155610754e-05, "loss": 0.3833, "step": 3481 }, { "epoch": 0.2710914310739137, "grad_norm": 1.9774606818524634, "learning_rate": 3.3863253988781203e-05, "loss": 0.4362, "step": 3482 }, { "epoch": 0.2711692861661233, "grad_norm": 2.074978583225716, "learning_rate": 3.38596555578393e-05, "loss": 0.421, "step": 3483 }, { "epoch": 0.27124714125833294, "grad_norm": 1.9342113849388831, "learning_rate": 3.385605626350601e-05, "loss": 0.4557, "step": 3484 }, { "epoch": 0.2713249963505426, "grad_norm": 1.9441591817196786, "learning_rate": 3.385245610600554e-05, "loss": 0.4284, "step": 3485 }, { "epoch": 0.27140285144275217, "grad_norm": 1.8858664147336726, "learning_rate": 3.384885508556217e-05, "loss": 0.4284, "step": 3486 }, { "epoch": 0.2714807065349618, "grad_norm": 1.8957863967256268, "learning_rate": 3.3845253202400214e-05, "loss": 0.3962, "step": 3487 }, { "epoch": 0.2715585616271714, "grad_norm": 2.1270257237523094, "learning_rate": 3.384165045674407e-05, "loss": 0.4534, "step": 3488 }, { "epoch": 0.27163641671938105, "grad_norm": 1.9819157464752164, "learning_rate": 3.3838046848818163e-05, "loss": 0.4507, "step": 3489 }, { "epoch": 0.2717142718115907, "grad_norm": 1.9606266270152417, "learning_rate": 3.3834442378846976e-05, "loss": 0.4256, "step": 3490 }, { "epoch": 0.2717921269038003, "grad_norm": 1.8543907428212199, "learning_rate": 3.383083704705507e-05, "loss": 0.4186, "step": 3491 }, { "epoch": 0.2718699819960099, "grad_norm": 2.0466424590489787, "learning_rate": 3.382723085366701e-05, "loss": 0.4102, "step": 3492 }, { "epoch": 0.27194783708821957, "grad_norm": 2.0118798120580275, "learning_rate": 3.382362379890747e-05, "loss": 0.4195, "step": 3493 }, { "epoch": 0.27202569218042916, "grad_norm": 1.9666230319484839, "learning_rate": 3.382001588300115e-05, "loss": 0.4584, "step": 3494 }, { "epoch": 0.2721035472726388, "grad_norm": 1.902957260518171, "learning_rate": 3.381640710617279e-05, "loss": 0.4164, "step": 3495 }, { "epoch": 0.27218140236484845, "grad_norm": 2.088637362275284, "learning_rate": 3.381279746864722e-05, "loss": 0.448, "step": 3496 }, { "epoch": 0.27225925745705803, "grad_norm": 1.8902473846014416, "learning_rate": 3.3809186970649284e-05, "loss": 0.4116, "step": 3497 }, { "epoch": 0.2723371125492677, "grad_norm": 1.9543321431364815, "learning_rate": 3.380557561240391e-05, "loss": 0.4117, "step": 3498 }, { "epoch": 0.2724149676414773, "grad_norm": 1.933851842004184, "learning_rate": 3.380196339413608e-05, "loss": 0.4278, "step": 3499 }, { "epoch": 0.2724928227336869, "grad_norm": 1.9980139347446888, "learning_rate": 3.3798350316070786e-05, "loss": 0.4184, "step": 3500 }, { "epoch": 0.2724928227336869, "eval_loss": 0.05262576416134834, "eval_runtime": 166.7918, "eval_samples_per_second": 17.267, "eval_steps_per_second": 0.618, "step": 3500 }, { "epoch": 0.27257067782589656, "grad_norm": 1.8580655349534037, "learning_rate": 3.3794736378433133e-05, "loss": 0.4122, "step": 3501 }, { "epoch": 0.2726485329181062, "grad_norm": 1.9005902414765707, "learning_rate": 3.379112158144825e-05, "loss": 0.3753, "step": 3502 }, { "epoch": 0.2727263880103158, "grad_norm": 1.9722256878651687, "learning_rate": 3.3787505925341314e-05, "loss": 0.4091, "step": 3503 }, { "epoch": 0.27280424310252543, "grad_norm": 1.8492422317309858, "learning_rate": 3.378388941033756e-05, "loss": 0.4304, "step": 3504 }, { "epoch": 0.272882098194735, "grad_norm": 1.9875917776393086, "learning_rate": 3.378027203666229e-05, "loss": 0.4479, "step": 3505 }, { "epoch": 0.27295995328694467, "grad_norm": 1.8485569229581003, "learning_rate": 3.377665380454085e-05, "loss": 0.4092, "step": 3506 }, { "epoch": 0.2730378083791543, "grad_norm": 2.1006889713033763, "learning_rate": 3.377303471419863e-05, "loss": 0.4519, "step": 3507 }, { "epoch": 0.2731156634713639, "grad_norm": 1.8246814679434968, "learning_rate": 3.376941476586108e-05, "loss": 0.3773, "step": 3508 }, { "epoch": 0.27319351856357355, "grad_norm": 1.8043718156573052, "learning_rate": 3.376579395975372e-05, "loss": 0.3929, "step": 3509 }, { "epoch": 0.2732713736557832, "grad_norm": 1.8303610066726141, "learning_rate": 3.3762172296102095e-05, "loss": 0.3848, "step": 3510 }, { "epoch": 0.2733492287479928, "grad_norm": 1.8632142308827202, "learning_rate": 3.375854977513183e-05, "loss": 0.3894, "step": 3511 }, { "epoch": 0.2734270838402024, "grad_norm": 1.9218007912342454, "learning_rate": 3.375492639706859e-05, "loss": 0.384, "step": 3512 }, { "epoch": 0.27350493893241207, "grad_norm": 1.9586711036506022, "learning_rate": 3.375130216213807e-05, "loss": 0.4525, "step": 3513 }, { "epoch": 0.27358279402462166, "grad_norm": 1.9219588251018853, "learning_rate": 3.3747677070566084e-05, "loss": 0.3999, "step": 3514 }, { "epoch": 0.2736606491168313, "grad_norm": 1.969052485022252, "learning_rate": 3.374405112257843e-05, "loss": 0.3896, "step": 3515 }, { "epoch": 0.27373850420904094, "grad_norm": 1.809260832393465, "learning_rate": 3.3740424318400985e-05, "loss": 0.3971, "step": 3516 }, { "epoch": 0.27381635930125053, "grad_norm": 1.8767308593294423, "learning_rate": 3.3736796658259696e-05, "loss": 0.4046, "step": 3517 }, { "epoch": 0.2738942143934602, "grad_norm": 1.7425342391286625, "learning_rate": 3.373316814238054e-05, "loss": 0.3971, "step": 3518 }, { "epoch": 0.2739720694856698, "grad_norm": 1.731738687126442, "learning_rate": 3.372953877098957e-05, "loss": 0.3737, "step": 3519 }, { "epoch": 0.2740499245778794, "grad_norm": 1.9472454182446615, "learning_rate": 3.372590854431286e-05, "loss": 0.3817, "step": 3520 }, { "epoch": 0.27412777967008906, "grad_norm": 1.9832194028702166, "learning_rate": 3.372227746257657e-05, "loss": 0.4023, "step": 3521 }, { "epoch": 0.27420563476229864, "grad_norm": 1.8048313654263044, "learning_rate": 3.3718645526006894e-05, "loss": 0.3586, "step": 3522 }, { "epoch": 0.2742834898545083, "grad_norm": 2.0875677157431216, "learning_rate": 3.371501273483008e-05, "loss": 0.4093, "step": 3523 }, { "epoch": 0.27436134494671793, "grad_norm": 2.0283353324159172, "learning_rate": 3.371137908927245e-05, "loss": 0.4136, "step": 3524 }, { "epoch": 0.2744392000389275, "grad_norm": 1.8867538542517652, "learning_rate": 3.3707744589560345e-05, "loss": 0.4177, "step": 3525 }, { "epoch": 0.27451705513113717, "grad_norm": 1.9617934713969214, "learning_rate": 3.3704109235920176e-05, "loss": 0.4133, "step": 3526 }, { "epoch": 0.2745949102233468, "grad_norm": 1.9039364472388667, "learning_rate": 3.3700473028578424e-05, "loss": 0.3882, "step": 3527 }, { "epoch": 0.2746727653155564, "grad_norm": 1.8744405152227013, "learning_rate": 3.3696835967761605e-05, "loss": 0.3938, "step": 3528 }, { "epoch": 0.27475062040776604, "grad_norm": 1.7647949811030936, "learning_rate": 3.3693198053696284e-05, "loss": 0.3847, "step": 3529 }, { "epoch": 0.2748284754999757, "grad_norm": 1.8152941932391646, "learning_rate": 3.368955928660908e-05, "loss": 0.3866, "step": 3530 }, { "epoch": 0.2749063305921853, "grad_norm": 1.8656765166222484, "learning_rate": 3.3685919666726684e-05, "loss": 0.4239, "step": 3531 }, { "epoch": 0.2749841856843949, "grad_norm": 2.05246423460076, "learning_rate": 3.3682279194275826e-05, "loss": 0.4203, "step": 3532 }, { "epoch": 0.27506204077660457, "grad_norm": 1.91688745976681, "learning_rate": 3.3678637869483285e-05, "loss": 0.41, "step": 3533 }, { "epoch": 0.27513989586881415, "grad_norm": 2.0356955145168314, "learning_rate": 3.36749956925759e-05, "loss": 0.4044, "step": 3534 }, { "epoch": 0.2752177509610238, "grad_norm": 1.8682464794029365, "learning_rate": 3.367135266378056e-05, "loss": 0.3713, "step": 3535 }, { "epoch": 0.27529560605323344, "grad_norm": 1.7869442658524406, "learning_rate": 3.366770878332421e-05, "loss": 0.3615, "step": 3536 }, { "epoch": 0.27537346114544303, "grad_norm": 1.7863960136185542, "learning_rate": 3.366406405143384e-05, "loss": 0.3822, "step": 3537 }, { "epoch": 0.2754513162376527, "grad_norm": 1.996145448634306, "learning_rate": 3.366041846833652e-05, "loss": 0.4343, "step": 3538 }, { "epoch": 0.2755291713298623, "grad_norm": 1.8890843208883115, "learning_rate": 3.365677203425933e-05, "loss": 0.4175, "step": 3539 }, { "epoch": 0.2756070264220719, "grad_norm": 1.9069957127957728, "learning_rate": 3.365312474942944e-05, "loss": 0.3858, "step": 3540 }, { "epoch": 0.27568488151428155, "grad_norm": 1.9391136452611661, "learning_rate": 3.364947661407404e-05, "loss": 0.3614, "step": 3541 }, { "epoch": 0.27576273660649114, "grad_norm": 1.862543873336594, "learning_rate": 3.364582762842043e-05, "loss": 0.3874, "step": 3542 }, { "epoch": 0.2758405916987008, "grad_norm": 1.8463656436860925, "learning_rate": 3.364217779269588e-05, "loss": 0.4094, "step": 3543 }, { "epoch": 0.27591844679091043, "grad_norm": 1.8255566549114997, "learning_rate": 3.363852710712778e-05, "loss": 0.3829, "step": 3544 }, { "epoch": 0.27599630188312, "grad_norm": 1.916359716314808, "learning_rate": 3.363487557194355e-05, "loss": 0.3941, "step": 3545 }, { "epoch": 0.27607415697532967, "grad_norm": 1.6643338108037036, "learning_rate": 3.363122318737067e-05, "loss": 0.3621, "step": 3546 }, { "epoch": 0.2761520120675393, "grad_norm": 1.881118300865761, "learning_rate": 3.362756995363665e-05, "loss": 0.4145, "step": 3547 }, { "epoch": 0.2762298671597489, "grad_norm": 1.7898873949256353, "learning_rate": 3.362391587096907e-05, "loss": 0.4105, "step": 3548 }, { "epoch": 0.27630772225195854, "grad_norm": 2.118844299263188, "learning_rate": 3.362026093959558e-05, "loss": 0.4371, "step": 3549 }, { "epoch": 0.2763855773441682, "grad_norm": 1.9527455453309384, "learning_rate": 3.361660515974385e-05, "loss": 0.3845, "step": 3550 }, { "epoch": 0.2763855773441682, "eval_loss": 0.05106492340564728, "eval_runtime": 167.2927, "eval_samples_per_second": 17.215, "eval_steps_per_second": 0.616, "step": 3550 }, { "epoch": 0.2764634324363778, "grad_norm": 1.9111517933270306, "learning_rate": 3.361294853164163e-05, "loss": 0.3744, "step": 3551 }, { "epoch": 0.2765412875285874, "grad_norm": 1.9520476821975072, "learning_rate": 3.3609291055516686e-05, "loss": 0.3935, "step": 3552 }, { "epoch": 0.27661914262079706, "grad_norm": 1.9402264678006282, "learning_rate": 3.3605632731596895e-05, "loss": 0.3923, "step": 3553 }, { "epoch": 0.27669699771300665, "grad_norm": 2.0768267271188905, "learning_rate": 3.3601973560110125e-05, "loss": 0.4115, "step": 3554 }, { "epoch": 0.2767748528052163, "grad_norm": 2.159493082780698, "learning_rate": 3.3598313541284345e-05, "loss": 0.4446, "step": 3555 }, { "epoch": 0.27685270789742594, "grad_norm": 1.8982917278244058, "learning_rate": 3.359465267534755e-05, "loss": 0.3928, "step": 3556 }, { "epoch": 0.27693056298963553, "grad_norm": 1.8744106491587302, "learning_rate": 3.359099096252779e-05, "loss": 0.398, "step": 3557 }, { "epoch": 0.2770084180818452, "grad_norm": 1.97173508277818, "learning_rate": 3.358732840305318e-05, "loss": 0.4499, "step": 3558 }, { "epoch": 0.27708627317405476, "grad_norm": 1.7907016624722876, "learning_rate": 3.358366499715187e-05, "loss": 0.4168, "step": 3559 }, { "epoch": 0.2771641282662644, "grad_norm": 1.8172320172805567, "learning_rate": 3.3580000745052085e-05, "loss": 0.4163, "step": 3560 }, { "epoch": 0.27724198335847405, "grad_norm": 1.790144294513845, "learning_rate": 3.357633564698208e-05, "loss": 0.3706, "step": 3561 }, { "epoch": 0.27731983845068364, "grad_norm": 1.9142615286695788, "learning_rate": 3.357266970317018e-05, "loss": 0.4218, "step": 3562 }, { "epoch": 0.2773976935428933, "grad_norm": 1.8266789839430866, "learning_rate": 3.356900291384477e-05, "loss": 0.3808, "step": 3563 }, { "epoch": 0.27747554863510293, "grad_norm": 1.9782619703860804, "learning_rate": 3.3565335279234234e-05, "loss": 0.4272, "step": 3564 }, { "epoch": 0.2775534037273125, "grad_norm": 2.0658000574992577, "learning_rate": 3.3561666799567086e-05, "loss": 0.4329, "step": 3565 }, { "epoch": 0.27763125881952216, "grad_norm": 1.8537041622840629, "learning_rate": 3.355799747507184e-05, "loss": 0.3895, "step": 3566 }, { "epoch": 0.2777091139117318, "grad_norm": 1.9122532481327692, "learning_rate": 3.355432730597707e-05, "loss": 0.4212, "step": 3567 }, { "epoch": 0.2777869690039414, "grad_norm": 1.9751969582727973, "learning_rate": 3.3550656292511434e-05, "loss": 0.4553, "step": 3568 }, { "epoch": 0.27786482409615104, "grad_norm": 1.725236291907136, "learning_rate": 3.3546984434903604e-05, "loss": 0.3825, "step": 3569 }, { "epoch": 0.2779426791883607, "grad_norm": 1.803719495125812, "learning_rate": 3.354331173338231e-05, "loss": 0.3915, "step": 3570 }, { "epoch": 0.2780205342805703, "grad_norm": 1.7241967663021989, "learning_rate": 3.353963818817636e-05, "loss": 0.3762, "step": 3571 }, { "epoch": 0.2780983893727799, "grad_norm": 1.7686412532781437, "learning_rate": 3.353596379951459e-05, "loss": 0.3703, "step": 3572 }, { "epoch": 0.27817624446498956, "grad_norm": 1.785790625605651, "learning_rate": 3.353228856762591e-05, "loss": 0.3755, "step": 3573 }, { "epoch": 0.27825409955719915, "grad_norm": 1.9808656569535503, "learning_rate": 3.3528612492739254e-05, "loss": 0.4548, "step": 3574 }, { "epoch": 0.2783319546494088, "grad_norm": 1.858090190528937, "learning_rate": 3.3524935575083625e-05, "loss": 0.3659, "step": 3575 }, { "epoch": 0.2784098097416184, "grad_norm": 1.853226690429568, "learning_rate": 3.352125781488809e-05, "loss": 0.4013, "step": 3576 }, { "epoch": 0.27848766483382803, "grad_norm": 1.7966862515944522, "learning_rate": 3.351757921238174e-05, "loss": 0.3447, "step": 3577 }, { "epoch": 0.2785655199260377, "grad_norm": 1.9543613500685166, "learning_rate": 3.351389976779375e-05, "loss": 0.3909, "step": 3578 }, { "epoch": 0.27864337501824726, "grad_norm": 1.8004727840247707, "learning_rate": 3.351021948135333e-05, "loss": 0.3764, "step": 3579 }, { "epoch": 0.2787212301104569, "grad_norm": 1.8481269108285803, "learning_rate": 3.350653835328973e-05, "loss": 0.3792, "step": 3580 }, { "epoch": 0.27879908520266655, "grad_norm": 1.7804184100814444, "learning_rate": 3.3502856383832276e-05, "loss": 0.375, "step": 3581 }, { "epoch": 0.27887694029487614, "grad_norm": 1.909478762922234, "learning_rate": 3.349917357321035e-05, "loss": 0.3984, "step": 3582 }, { "epoch": 0.2789547953870858, "grad_norm": 1.733718458846998, "learning_rate": 3.3495489921653346e-05, "loss": 0.371, "step": 3583 }, { "epoch": 0.27903265047929543, "grad_norm": 1.8428024123021558, "learning_rate": 3.3491805429390766e-05, "loss": 0.3937, "step": 3584 }, { "epoch": 0.279110505571505, "grad_norm": 2.0921746995088952, "learning_rate": 3.3488120096652114e-05, "loss": 0.4129, "step": 3585 }, { "epoch": 0.27918836066371466, "grad_norm": 1.9122097725484728, "learning_rate": 3.348443392366699e-05, "loss": 0.3918, "step": 3586 }, { "epoch": 0.2792662157559243, "grad_norm": 1.967171203530554, "learning_rate": 3.3480746910665015e-05, "loss": 0.4144, "step": 3587 }, { "epoch": 0.2793440708481339, "grad_norm": 1.8682906483732489, "learning_rate": 3.347705905787587e-05, "loss": 0.4551, "step": 3588 }, { "epoch": 0.27942192594034354, "grad_norm": 1.9292109779087117, "learning_rate": 3.347337036552929e-05, "loss": 0.4309, "step": 3589 }, { "epoch": 0.2794997810325532, "grad_norm": 1.7870346425602253, "learning_rate": 3.3469680833855074e-05, "loss": 0.3858, "step": 3590 }, { "epoch": 0.2795776361247628, "grad_norm": 1.7093411616284535, "learning_rate": 3.346599046308306e-05, "loss": 0.3679, "step": 3591 }, { "epoch": 0.2796554912169724, "grad_norm": 1.783601615153453, "learning_rate": 3.346229925344312e-05, "loss": 0.3942, "step": 3592 }, { "epoch": 0.27973334630918206, "grad_norm": 1.9199918548587736, "learning_rate": 3.345860720516522e-05, "loss": 0.3907, "step": 3593 }, { "epoch": 0.27981120140139165, "grad_norm": 2.084342083565114, "learning_rate": 3.3454914318479356e-05, "loss": 0.4508, "step": 3594 }, { "epoch": 0.2798890564936013, "grad_norm": 1.7786242278902025, "learning_rate": 3.345122059361557e-05, "loss": 0.3735, "step": 3595 }, { "epoch": 0.2799669115858109, "grad_norm": 1.784979887868737, "learning_rate": 3.344752603080397e-05, "loss": 0.3709, "step": 3596 }, { "epoch": 0.28004476667802053, "grad_norm": 1.8657307618331431, "learning_rate": 3.3443830630274704e-05, "loss": 0.3655, "step": 3597 }, { "epoch": 0.2801226217702302, "grad_norm": 1.9279396886275806, "learning_rate": 3.344013439225799e-05, "loss": 0.3941, "step": 3598 }, { "epoch": 0.28020047686243976, "grad_norm": 1.6931670277951474, "learning_rate": 3.343643731698407e-05, "loss": 0.3525, "step": 3599 }, { "epoch": 0.2802783319546494, "grad_norm": 1.81078059115167, "learning_rate": 3.343273940468326e-05, "loss": 0.3841, "step": 3600 }, { "epoch": 0.2802783319546494, "eval_loss": 0.050073038786649704, "eval_runtime": 166.8743, "eval_samples_per_second": 17.258, "eval_steps_per_second": 0.617, "step": 3600 }, { "epoch": 0.28035618704685905, "grad_norm": 1.6510674439839788, "learning_rate": 3.342904065558593e-05, "loss": 0.3378, "step": 3601 }, { "epoch": 0.28043404213906864, "grad_norm": 1.6921247068228782, "learning_rate": 3.342534106992249e-05, "loss": 0.3496, "step": 3602 }, { "epoch": 0.2805118972312783, "grad_norm": 1.7532643598529452, "learning_rate": 3.3421640647923394e-05, "loss": 0.3756, "step": 3603 }, { "epoch": 0.28058975232348793, "grad_norm": 2.092569346315904, "learning_rate": 3.341793938981918e-05, "loss": 0.4214, "step": 3604 }, { "epoch": 0.2806676074156975, "grad_norm": 2.1638198206382526, "learning_rate": 3.341423729584042e-05, "loss": 0.4613, "step": 3605 }, { "epoch": 0.28074546250790716, "grad_norm": 1.8290865633298252, "learning_rate": 3.3410534366217725e-05, "loss": 0.408, "step": 3606 }, { "epoch": 0.2808233176001168, "grad_norm": 1.754778635552501, "learning_rate": 3.3406830601181765e-05, "loss": 0.3495, "step": 3607 }, { "epoch": 0.2809011726923264, "grad_norm": 1.886390735266146, "learning_rate": 3.340312600096328e-05, "loss": 0.3735, "step": 3608 }, { "epoch": 0.28097902778453604, "grad_norm": 1.8438277621183259, "learning_rate": 3.339942056579306e-05, "loss": 0.3818, "step": 3609 }, { "epoch": 0.2810568828767457, "grad_norm": 1.8146308274448195, "learning_rate": 3.339571429590191e-05, "loss": 0.376, "step": 3610 }, { "epoch": 0.2811347379689553, "grad_norm": 1.9158191784385474, "learning_rate": 3.3392007191520716e-05, "loss": 0.4247, "step": 3611 }, { "epoch": 0.2812125930611649, "grad_norm": 1.9384212475158271, "learning_rate": 3.338829925288044e-05, "loss": 0.4012, "step": 3612 }, { "epoch": 0.2812904481533745, "grad_norm": 1.9096149750207594, "learning_rate": 3.338459048021203e-05, "loss": 0.3585, "step": 3613 }, { "epoch": 0.28136830324558415, "grad_norm": 1.7403527420694087, "learning_rate": 3.338088087374656e-05, "loss": 0.3501, "step": 3614 }, { "epoch": 0.2814461583377938, "grad_norm": 1.8312854073472065, "learning_rate": 3.337717043371511e-05, "loss": 0.399, "step": 3615 }, { "epoch": 0.2815240134300034, "grad_norm": 1.9801214025800429, "learning_rate": 3.337345916034881e-05, "loss": 0.3725, "step": 3616 }, { "epoch": 0.28160186852221303, "grad_norm": 1.9028955965562164, "learning_rate": 3.336974705387887e-05, "loss": 0.381, "step": 3617 }, { "epoch": 0.2816797236144227, "grad_norm": 1.8899657532294307, "learning_rate": 3.336603411453654e-05, "loss": 0.3807, "step": 3618 }, { "epoch": 0.28175757870663226, "grad_norm": 1.8839515016389237, "learning_rate": 3.33623203425531e-05, "loss": 0.4028, "step": 3619 }, { "epoch": 0.2818354337988419, "grad_norm": 1.891754874573902, "learning_rate": 3.3358605738159916e-05, "loss": 0.4016, "step": 3620 }, { "epoch": 0.28191328889105155, "grad_norm": 1.8250417713306595, "learning_rate": 3.3354890301588386e-05, "loss": 0.3747, "step": 3621 }, { "epoch": 0.28199114398326114, "grad_norm": 1.9646740121212445, "learning_rate": 3.335117403306996e-05, "loss": 0.3948, "step": 3622 }, { "epoch": 0.2820689990754708, "grad_norm": 1.7521742164618908, "learning_rate": 3.334745693283615e-05, "loss": 0.3589, "step": 3623 }, { "epoch": 0.2821468541676804, "grad_norm": 2.0855297421824255, "learning_rate": 3.334373900111851e-05, "loss": 0.4015, "step": 3624 }, { "epoch": 0.28222470925989, "grad_norm": 1.837245868564973, "learning_rate": 3.334002023814866e-05, "loss": 0.3649, "step": 3625 }, { "epoch": 0.28230256435209966, "grad_norm": 1.8790278579386772, "learning_rate": 3.333630064415824e-05, "loss": 0.4041, "step": 3626 }, { "epoch": 0.2823804194443093, "grad_norm": 1.7321877509288952, "learning_rate": 3.333258021937898e-05, "loss": 0.3792, "step": 3627 }, { "epoch": 0.2824582745365189, "grad_norm": 1.80187326651331, "learning_rate": 3.332885896404264e-05, "loss": 0.3549, "step": 3628 }, { "epoch": 0.28253612962872854, "grad_norm": 1.9245314612556592, "learning_rate": 3.332513687838104e-05, "loss": 0.3876, "step": 3629 }, { "epoch": 0.2826139847209381, "grad_norm": 1.780589865473518, "learning_rate": 3.332141396262605e-05, "loss": 0.3856, "step": 3630 }, { "epoch": 0.28269183981314777, "grad_norm": 1.909176997761319, "learning_rate": 3.331769021700958e-05, "loss": 0.4241, "step": 3631 }, { "epoch": 0.2827696949053574, "grad_norm": 1.939055575095557, "learning_rate": 3.331396564176361e-05, "loss": 0.4449, "step": 3632 }, { "epoch": 0.282847549997567, "grad_norm": 1.9056808437408694, "learning_rate": 3.331024023712016e-05, "loss": 0.3796, "step": 3633 }, { "epoch": 0.28292540508977665, "grad_norm": 1.8470679027061108, "learning_rate": 3.3306514003311305e-05, "loss": 0.397, "step": 3634 }, { "epoch": 0.2830032601819863, "grad_norm": 1.9584311010491788, "learning_rate": 3.330278694056918e-05, "loss": 0.4368, "step": 3635 }, { "epoch": 0.2830811152741959, "grad_norm": 1.8290424962524994, "learning_rate": 3.329905904912596e-05, "loss": 0.4075, "step": 3636 }, { "epoch": 0.2831589703664055, "grad_norm": 1.8375201767519271, "learning_rate": 3.3295330329213865e-05, "loss": 0.3654, "step": 3637 }, { "epoch": 0.28323682545861517, "grad_norm": 1.9317343212302773, "learning_rate": 3.3291600781065186e-05, "loss": 0.3993, "step": 3638 }, { "epoch": 0.28331468055082476, "grad_norm": 1.8649131550922078, "learning_rate": 3.328787040491226e-05, "loss": 0.3646, "step": 3639 }, { "epoch": 0.2833925356430344, "grad_norm": 1.872821134889288, "learning_rate": 3.3284139200987456e-05, "loss": 0.3826, "step": 3640 }, { "epoch": 0.28347039073524405, "grad_norm": 1.8549021702104143, "learning_rate": 3.328040716952323e-05, "loss": 0.3769, "step": 3641 }, { "epoch": 0.28354824582745364, "grad_norm": 1.9921537073603097, "learning_rate": 3.327667431075205e-05, "loss": 0.3991, "step": 3642 }, { "epoch": 0.2836261009196633, "grad_norm": 1.7422274386509602, "learning_rate": 3.327294062490648e-05, "loss": 0.3471, "step": 3643 }, { "epoch": 0.2837039560118729, "grad_norm": 1.770402509600628, "learning_rate": 3.3269206112219085e-05, "loss": 0.354, "step": 3644 }, { "epoch": 0.2837818111040825, "grad_norm": 1.8677352497988795, "learning_rate": 3.3265470772922524e-05, "loss": 0.3714, "step": 3645 }, { "epoch": 0.28385966619629216, "grad_norm": 1.6935462604029412, "learning_rate": 3.326173460724949e-05, "loss": 0.3553, "step": 3646 }, { "epoch": 0.28393752128850175, "grad_norm": 1.847883987260534, "learning_rate": 3.3257997615432724e-05, "loss": 0.356, "step": 3647 }, { "epoch": 0.2840153763807114, "grad_norm": 1.9832510874367177, "learning_rate": 3.325425979770503e-05, "loss": 0.4122, "step": 3648 }, { "epoch": 0.28409323147292104, "grad_norm": 1.780296443255346, "learning_rate": 3.325052115429924e-05, "loss": 0.3767, "step": 3649 }, { "epoch": 0.2841710865651306, "grad_norm": 1.8031986818945283, "learning_rate": 3.324678168544827e-05, "loss": 0.3642, "step": 3650 }, { "epoch": 0.2841710865651306, "eval_loss": 0.0487583689391613, "eval_runtime": 166.8141, "eval_samples_per_second": 17.265, "eval_steps_per_second": 0.617, "step": 3650 }, { "epoch": 0.28424894165734027, "grad_norm": 1.7917688647777272, "learning_rate": 3.324304139138507e-05, "loss": 0.3522, "step": 3651 }, { "epoch": 0.2843267967495499, "grad_norm": 2.0561936451260925, "learning_rate": 3.323930027234263e-05, "loss": 0.4002, "step": 3652 }, { "epoch": 0.2844046518417595, "grad_norm": 1.9256755220653232, "learning_rate": 3.323555832855403e-05, "loss": 0.3909, "step": 3653 }, { "epoch": 0.28448250693396915, "grad_norm": 1.8895414480703727, "learning_rate": 3.323181556025234e-05, "loss": 0.3836, "step": 3654 }, { "epoch": 0.2845603620261788, "grad_norm": 1.8088540418834107, "learning_rate": 3.322807196767075e-05, "loss": 0.3819, "step": 3655 }, { "epoch": 0.2846382171183884, "grad_norm": 1.824253438381528, "learning_rate": 3.322432755104244e-05, "loss": 0.3473, "step": 3656 }, { "epoch": 0.284716072210598, "grad_norm": 1.9056289679773586, "learning_rate": 3.3220582310600696e-05, "loss": 0.3787, "step": 3657 }, { "epoch": 0.28479392730280767, "grad_norm": 1.8785170244833882, "learning_rate": 3.3216836246578815e-05, "loss": 0.3797, "step": 3658 }, { "epoch": 0.28487178239501726, "grad_norm": 1.7835391520170825, "learning_rate": 3.321308935921016e-05, "loss": 0.4006, "step": 3659 }, { "epoch": 0.2849496374872269, "grad_norm": 1.9138242777486663, "learning_rate": 3.3209341648728134e-05, "loss": 0.3682, "step": 3660 }, { "epoch": 0.28502749257943655, "grad_norm": 1.7875917963938432, "learning_rate": 3.3205593115366226e-05, "loss": 0.3706, "step": 3661 }, { "epoch": 0.28510534767164614, "grad_norm": 1.707835962101531, "learning_rate": 3.320184375935793e-05, "loss": 0.362, "step": 3662 }, { "epoch": 0.2851832027638558, "grad_norm": 1.7217016360275066, "learning_rate": 3.3198093580936826e-05, "loss": 0.3946, "step": 3663 }, { "epoch": 0.2852610578560654, "grad_norm": 1.9399684487740185, "learning_rate": 3.319434258033653e-05, "loss": 0.399, "step": 3664 }, { "epoch": 0.285338912948275, "grad_norm": 2.023271902753076, "learning_rate": 3.319059075779071e-05, "loss": 0.4022, "step": 3665 }, { "epoch": 0.28541676804048466, "grad_norm": 1.8528218535213004, "learning_rate": 3.3186838113533074e-05, "loss": 0.3741, "step": 3666 }, { "epoch": 0.28549462313269425, "grad_norm": 1.9400122135256126, "learning_rate": 3.318308464779742e-05, "loss": 0.3972, "step": 3667 }, { "epoch": 0.2855724782249039, "grad_norm": 1.8508257199876694, "learning_rate": 3.3179330360817555e-05, "loss": 0.3755, "step": 3668 }, { "epoch": 0.28565033331711354, "grad_norm": 1.7835646833642875, "learning_rate": 3.317557525282736e-05, "loss": 0.3953, "step": 3669 }, { "epoch": 0.2857281884093231, "grad_norm": 1.740398017819415, "learning_rate": 3.317181932406076e-05, "loss": 0.3598, "step": 3670 }, { "epoch": 0.28580604350153277, "grad_norm": 1.9256909523917303, "learning_rate": 3.316806257475172e-05, "loss": 0.3903, "step": 3671 }, { "epoch": 0.2858838985937424, "grad_norm": 1.7100884373157934, "learning_rate": 3.3164305005134284e-05, "loss": 0.3387, "step": 3672 }, { "epoch": 0.285961753685952, "grad_norm": 1.8839572009042354, "learning_rate": 3.316054661544252e-05, "loss": 0.3899, "step": 3673 }, { "epoch": 0.28603960877816165, "grad_norm": 1.884872607853927, "learning_rate": 3.315678740591057e-05, "loss": 0.3714, "step": 3674 }, { "epoch": 0.2861174638703713, "grad_norm": 1.8904862478928084, "learning_rate": 3.315302737677259e-05, "loss": 0.4031, "step": 3675 }, { "epoch": 0.2861953189625809, "grad_norm": 1.6660432307027517, "learning_rate": 3.314926652826284e-05, "loss": 0.346, "step": 3676 }, { "epoch": 0.2862731740547905, "grad_norm": 1.8774823095832613, "learning_rate": 3.3145504860615596e-05, "loss": 0.3961, "step": 3677 }, { "epoch": 0.28635102914700017, "grad_norm": 2.055456759196195, "learning_rate": 3.314174237406518e-05, "loss": 0.3891, "step": 3678 }, { "epoch": 0.28642888423920976, "grad_norm": 1.9028231736797934, "learning_rate": 3.3137979068845994e-05, "loss": 0.4217, "step": 3679 }, { "epoch": 0.2865067393314194, "grad_norm": 1.8572602442205088, "learning_rate": 3.313421494519246e-05, "loss": 0.3522, "step": 3680 }, { "epoch": 0.28658459442362905, "grad_norm": 1.8316170934500045, "learning_rate": 3.313045000333907e-05, "loss": 0.3988, "step": 3681 }, { "epoch": 0.28666244951583864, "grad_norm": 1.8623666653669033, "learning_rate": 3.312668424352037e-05, "loss": 0.3748, "step": 3682 }, { "epoch": 0.2867403046080483, "grad_norm": 1.879451282538722, "learning_rate": 3.3122917665970935e-05, "loss": 0.3491, "step": 3683 }, { "epoch": 0.28681815970025787, "grad_norm": 1.7109053781233354, "learning_rate": 3.311915027092542e-05, "loss": 0.3577, "step": 3684 }, { "epoch": 0.2868960147924675, "grad_norm": 1.746100028467469, "learning_rate": 3.3115382058618504e-05, "loss": 0.3888, "step": 3685 }, { "epoch": 0.28697386988467716, "grad_norm": 1.7959791687027544, "learning_rate": 3.3111613029284936e-05, "loss": 0.3623, "step": 3686 }, { "epoch": 0.28705172497688675, "grad_norm": 1.9364268645496094, "learning_rate": 3.310784318315951e-05, "loss": 0.3989, "step": 3687 }, { "epoch": 0.2871295800690964, "grad_norm": 1.8455886402240325, "learning_rate": 3.310407252047705e-05, "loss": 0.3714, "step": 3688 }, { "epoch": 0.28720743516130604, "grad_norm": 1.8537753242266224, "learning_rate": 3.310030104147249e-05, "loss": 0.4001, "step": 3689 }, { "epoch": 0.2872852902535156, "grad_norm": 1.9066604325098933, "learning_rate": 3.3096528746380734e-05, "loss": 0.4176, "step": 3690 }, { "epoch": 0.28736314534572527, "grad_norm": 1.8497960208993893, "learning_rate": 3.30927556354368e-05, "loss": 0.3551, "step": 3691 }, { "epoch": 0.2874410004379349, "grad_norm": 1.8475830482451043, "learning_rate": 3.308898170887574e-05, "loss": 0.3689, "step": 3692 }, { "epoch": 0.2875188555301445, "grad_norm": 1.674325550888163, "learning_rate": 3.308520696693263e-05, "loss": 0.3223, "step": 3693 }, { "epoch": 0.28759671062235415, "grad_norm": 1.7896292395872284, "learning_rate": 3.308143140984264e-05, "loss": 0.3606, "step": 3694 }, { "epoch": 0.2876745657145638, "grad_norm": 1.8862776004054558, "learning_rate": 3.307765503784096e-05, "loss": 0.3865, "step": 3695 }, { "epoch": 0.2877524208067734, "grad_norm": 1.8433131399886578, "learning_rate": 3.3073877851162835e-05, "loss": 0.3912, "step": 3696 }, { "epoch": 0.287830275898983, "grad_norm": 1.9483230642768483, "learning_rate": 3.3070099850043585e-05, "loss": 0.4039, "step": 3697 }, { "epoch": 0.28790813099119267, "grad_norm": 1.7335645787652114, "learning_rate": 3.306632103471854e-05, "loss": 0.3476, "step": 3698 }, { "epoch": 0.28798598608340226, "grad_norm": 1.7717623873188502, "learning_rate": 3.3062541405423114e-05, "loss": 0.3673, "step": 3699 }, { "epoch": 0.2880638411756119, "grad_norm": 1.7362896446284013, "learning_rate": 3.305876096239276e-05, "loss": 0.359, "step": 3700 }, { "epoch": 0.2880638411756119, "eval_loss": 0.04782591387629509, "eval_runtime": 166.9495, "eval_samples_per_second": 17.251, "eval_steps_per_second": 0.617, "step": 3700 }, { "epoch": 0.2881416962678215, "grad_norm": 1.8391022051110022, "learning_rate": 3.305497970586298e-05, "loss": 0.3676, "step": 3701 }, { "epoch": 0.28821955136003113, "grad_norm": 2.0381318116187264, "learning_rate": 3.305119763606932e-05, "loss": 0.425, "step": 3702 }, { "epoch": 0.2882974064522408, "grad_norm": 1.805215323131731, "learning_rate": 3.30474147532474e-05, "loss": 0.3952, "step": 3703 }, { "epoch": 0.28837526154445037, "grad_norm": 1.763871664352099, "learning_rate": 3.304363105763287e-05, "loss": 0.3737, "step": 3704 }, { "epoch": 0.28845311663666, "grad_norm": 1.8134880052918096, "learning_rate": 3.303984654946144e-05, "loss": 0.3728, "step": 3705 }, { "epoch": 0.28853097172886966, "grad_norm": 1.9166789039382663, "learning_rate": 3.303606122896885e-05, "loss": 0.3746, "step": 3706 }, { "epoch": 0.28860882682107925, "grad_norm": 1.9778736571523363, "learning_rate": 3.3032275096390923e-05, "loss": 0.4667, "step": 3707 }, { "epoch": 0.2886866819132889, "grad_norm": 1.6515010770196628, "learning_rate": 3.302848815196352e-05, "loss": 0.3603, "step": 3708 }, { "epoch": 0.28876453700549853, "grad_norm": 1.772747512230801, "learning_rate": 3.302470039592255e-05, "loss": 0.3355, "step": 3709 }, { "epoch": 0.2888423920977081, "grad_norm": 1.8349215561700185, "learning_rate": 3.302091182850396e-05, "loss": 0.346, "step": 3710 }, { "epoch": 0.28892024718991777, "grad_norm": 1.8154599929654083, "learning_rate": 3.3017122449943766e-05, "loss": 0.3797, "step": 3711 }, { "epoch": 0.2889981022821274, "grad_norm": 2.0915330479829586, "learning_rate": 3.3013332260478026e-05, "loss": 0.4357, "step": 3712 }, { "epoch": 0.289075957374337, "grad_norm": 1.8250673948051326, "learning_rate": 3.300954126034286e-05, "loss": 0.3765, "step": 3713 }, { "epoch": 0.28915381246654664, "grad_norm": 1.9080536240594512, "learning_rate": 3.300574944977442e-05, "loss": 0.4092, "step": 3714 }, { "epoch": 0.2892316675587563, "grad_norm": 1.9041285358085056, "learning_rate": 3.300195682900892e-05, "loss": 0.3756, "step": 3715 }, { "epoch": 0.2893095226509659, "grad_norm": 1.7671898379405668, "learning_rate": 3.299816339828263e-05, "loss": 0.3645, "step": 3716 }, { "epoch": 0.2893873777431755, "grad_norm": 1.9488622000944178, "learning_rate": 3.2994369157831856e-05, "loss": 0.354, "step": 3717 }, { "epoch": 0.28946523283538517, "grad_norm": 1.6526203108226893, "learning_rate": 3.299057410789297e-05, "loss": 0.3294, "step": 3718 }, { "epoch": 0.28954308792759476, "grad_norm": 1.792711380235442, "learning_rate": 3.298677824870236e-05, "loss": 0.3953, "step": 3719 }, { "epoch": 0.2896209430198044, "grad_norm": 1.7974529239453723, "learning_rate": 3.298298158049652e-05, "loss": 0.3944, "step": 3720 }, { "epoch": 0.289698798112014, "grad_norm": 1.750845545993764, "learning_rate": 3.2979184103511946e-05, "loss": 0.3576, "step": 3721 }, { "epoch": 0.28977665320422363, "grad_norm": 1.7940738890601307, "learning_rate": 3.297538581798522e-05, "loss": 0.3857, "step": 3722 }, { "epoch": 0.2898545082964333, "grad_norm": 1.8765300603298698, "learning_rate": 3.297158672415294e-05, "loss": 0.3729, "step": 3723 }, { "epoch": 0.28993236338864287, "grad_norm": 1.7423942398236318, "learning_rate": 3.296778682225178e-05, "loss": 0.3616, "step": 3724 }, { "epoch": 0.2900102184808525, "grad_norm": 1.8035944014083227, "learning_rate": 3.296398611251846e-05, "loss": 0.3651, "step": 3725 }, { "epoch": 0.29008807357306216, "grad_norm": 1.7946771615557768, "learning_rate": 3.296018459518973e-05, "loss": 0.3423, "step": 3726 }, { "epoch": 0.29016592866527174, "grad_norm": 1.93114131066112, "learning_rate": 3.2956382270502424e-05, "loss": 0.4209, "step": 3727 }, { "epoch": 0.2902437837574814, "grad_norm": 1.7089267195736708, "learning_rate": 3.295257913869341e-05, "loss": 0.3768, "step": 3728 }, { "epoch": 0.29032163884969103, "grad_norm": 1.9262170771230662, "learning_rate": 3.294877519999959e-05, "loss": 0.4381, "step": 3729 }, { "epoch": 0.2903994939419006, "grad_norm": 1.7260929405303067, "learning_rate": 3.294497045465794e-05, "loss": 0.3279, "step": 3730 }, { "epoch": 0.29047734903411027, "grad_norm": 1.7679984034519316, "learning_rate": 3.294116490290548e-05, "loss": 0.3569, "step": 3731 }, { "epoch": 0.2905552041263199, "grad_norm": 1.6675663999241694, "learning_rate": 3.2937358544979276e-05, "loss": 0.322, "step": 3732 }, { "epoch": 0.2906330592185295, "grad_norm": 1.7442875525109778, "learning_rate": 3.293355138111644e-05, "loss": 0.3533, "step": 3733 }, { "epoch": 0.29071091431073914, "grad_norm": 1.822535344921516, "learning_rate": 3.2929743411554154e-05, "loss": 0.3794, "step": 3734 }, { "epoch": 0.2907887694029488, "grad_norm": 1.7269690662734556, "learning_rate": 3.292593463652963e-05, "loss": 0.3245, "step": 3735 }, { "epoch": 0.2908666244951584, "grad_norm": 1.687515493770866, "learning_rate": 3.2922125056280124e-05, "loss": 0.3156, "step": 3736 }, { "epoch": 0.290944479587368, "grad_norm": 1.8663045349046905, "learning_rate": 3.291831467104297e-05, "loss": 0.3504, "step": 3737 }, { "epoch": 0.2910223346795776, "grad_norm": 1.8307906016871063, "learning_rate": 3.2914503481055537e-05, "loss": 0.374, "step": 3738 }, { "epoch": 0.29110018977178725, "grad_norm": 1.6557508860573267, "learning_rate": 3.291069148655524e-05, "loss": 0.338, "step": 3739 }, { "epoch": 0.2911780448639969, "grad_norm": 1.7907934484393648, "learning_rate": 3.290687868777955e-05, "loss": 0.3588, "step": 3740 }, { "epoch": 0.2912558999562065, "grad_norm": 1.7160971920763095, "learning_rate": 3.290306508496598e-05, "loss": 0.3428, "step": 3741 }, { "epoch": 0.29133375504841613, "grad_norm": 1.880788177807651, "learning_rate": 3.289925067835211e-05, "loss": 0.377, "step": 3742 }, { "epoch": 0.2914116101406258, "grad_norm": 1.8389961315523, "learning_rate": 3.289543546817555e-05, "loss": 0.414, "step": 3743 }, { "epoch": 0.29148946523283537, "grad_norm": 1.7455044113833138, "learning_rate": 3.289161945467398e-05, "loss": 0.388, "step": 3744 }, { "epoch": 0.291567320325045, "grad_norm": 1.7451741493487967, "learning_rate": 3.288780263808511e-05, "loss": 0.3894, "step": 3745 }, { "epoch": 0.29164517541725465, "grad_norm": 1.672166013232833, "learning_rate": 3.288398501864672e-05, "loss": 0.3311, "step": 3746 }, { "epoch": 0.29172303050946424, "grad_norm": 1.7240086996751929, "learning_rate": 3.288016659659662e-05, "loss": 0.3621, "step": 3747 }, { "epoch": 0.2918008856016739, "grad_norm": 1.9141861610092408, "learning_rate": 3.287634737217268e-05, "loss": 0.3837, "step": 3748 }, { "epoch": 0.29187874069388353, "grad_norm": 1.7154779330602992, "learning_rate": 3.287252734561282e-05, "loss": 0.3422, "step": 3749 }, { "epoch": 0.2919565957860931, "grad_norm": 1.7864458458161674, "learning_rate": 3.286870651715502e-05, "loss": 0.3893, "step": 3750 }, { "epoch": 0.2919565957860931, "eval_loss": 0.046693671494722366, "eval_runtime": 163.2471, "eval_samples_per_second": 17.642, "eval_steps_per_second": 0.631, "step": 3750 }, { "epoch": 0.29203445087830276, "grad_norm": 1.7922378494277764, "learning_rate": 3.28648848870373e-05, "loss": 0.385, "step": 3751 }, { "epoch": 0.2921123059705124, "grad_norm": 1.8478891161213162, "learning_rate": 3.2861062455497706e-05, "loss": 0.3503, "step": 3752 }, { "epoch": 0.292190161062722, "grad_norm": 1.6480039698614952, "learning_rate": 3.2857239222774376e-05, "loss": 0.3454, "step": 3753 }, { "epoch": 0.29226801615493164, "grad_norm": 1.6878806280341183, "learning_rate": 3.2853415189105486e-05, "loss": 0.3432, "step": 3754 }, { "epoch": 0.29234587124714123, "grad_norm": 1.8276702547179253, "learning_rate": 3.284959035472923e-05, "loss": 0.3961, "step": 3755 }, { "epoch": 0.2924237263393509, "grad_norm": 1.9305542953264245, "learning_rate": 3.284576471988391e-05, "loss": 0.3607, "step": 3756 }, { "epoch": 0.2925015814315605, "grad_norm": 1.7280583719625193, "learning_rate": 3.284193828480782e-05, "loss": 0.3435, "step": 3757 }, { "epoch": 0.2925794365237701, "grad_norm": 1.8377880005681, "learning_rate": 3.2838111049739337e-05, "loss": 0.3733, "step": 3758 }, { "epoch": 0.29265729161597975, "grad_norm": 1.8050068086891182, "learning_rate": 3.2834283014916886e-05, "loss": 0.3581, "step": 3759 }, { "epoch": 0.2927351467081894, "grad_norm": 1.9458194881211466, "learning_rate": 3.2830454180578925e-05, "loss": 0.3958, "step": 3760 }, { "epoch": 0.292813001800399, "grad_norm": 1.8875259355315825, "learning_rate": 3.282662454696398e-05, "loss": 0.3979, "step": 3761 }, { "epoch": 0.29289085689260863, "grad_norm": 1.8378535201695305, "learning_rate": 3.2822794114310603e-05, "loss": 0.3502, "step": 3762 }, { "epoch": 0.2929687119848183, "grad_norm": 1.8811983773586582, "learning_rate": 3.281896288285744e-05, "loss": 0.3787, "step": 3763 }, { "epoch": 0.29304656707702786, "grad_norm": 1.7110741221559778, "learning_rate": 3.281513085284314e-05, "loss": 0.3317, "step": 3764 }, { "epoch": 0.2931244221692375, "grad_norm": 1.8073604725960202, "learning_rate": 3.281129802450642e-05, "loss": 0.3667, "step": 3765 }, { "epoch": 0.29320227726144715, "grad_norm": 1.6965912912603704, "learning_rate": 3.280746439808606e-05, "loss": 0.2959, "step": 3766 }, { "epoch": 0.29328013235365674, "grad_norm": 1.812339995828348, "learning_rate": 3.280362997382086e-05, "loss": 0.361, "step": 3767 }, { "epoch": 0.2933579874458664, "grad_norm": 1.840395372087058, "learning_rate": 3.27997947519497e-05, "loss": 0.3702, "step": 3768 }, { "epoch": 0.29343584253807603, "grad_norm": 1.7862227383026985, "learning_rate": 3.279595873271149e-05, "loss": 0.3179, "step": 3769 }, { "epoch": 0.2935136976302856, "grad_norm": 1.8984436284795745, "learning_rate": 3.27921219163452e-05, "loss": 0.3709, "step": 3770 }, { "epoch": 0.29359155272249526, "grad_norm": 1.8230886578114267, "learning_rate": 3.2788284303089844e-05, "loss": 0.3553, "step": 3771 }, { "epoch": 0.2936694078147049, "grad_norm": 1.6319436986501763, "learning_rate": 3.278444589318448e-05, "loss": 0.3355, "step": 3772 }, { "epoch": 0.2937472629069145, "grad_norm": 1.9381651161267388, "learning_rate": 3.2780606686868225e-05, "loss": 0.3814, "step": 3773 }, { "epoch": 0.29382511799912414, "grad_norm": 1.6892833226999828, "learning_rate": 3.2776766684380254e-05, "loss": 0.3553, "step": 3774 }, { "epoch": 0.29390297309133373, "grad_norm": 1.7511060531960323, "learning_rate": 3.277292588595978e-05, "loss": 0.3706, "step": 3775 }, { "epoch": 0.2939808281835434, "grad_norm": 1.7289075740310462, "learning_rate": 3.276908429184605e-05, "loss": 0.3898, "step": 3776 }, { "epoch": 0.294058683275753, "grad_norm": 1.8919157131820492, "learning_rate": 3.27652419022784e-05, "loss": 0.3576, "step": 3777 }, { "epoch": 0.2941365383679626, "grad_norm": 1.7855502631231825, "learning_rate": 3.2761398717496174e-05, "loss": 0.37, "step": 3778 }, { "epoch": 0.29421439346017225, "grad_norm": 1.813869679880701, "learning_rate": 3.2757554737738796e-05, "loss": 0.3525, "step": 3779 }, { "epoch": 0.2942922485523819, "grad_norm": 1.8236034395340042, "learning_rate": 3.275370996324572e-05, "loss": 0.3491, "step": 3780 }, { "epoch": 0.2943701036445915, "grad_norm": 1.9909665153944538, "learning_rate": 3.2749864394256464e-05, "loss": 0.384, "step": 3781 }, { "epoch": 0.29444795873680113, "grad_norm": 1.731476081804644, "learning_rate": 3.2746018031010585e-05, "loss": 0.3313, "step": 3782 }, { "epoch": 0.2945258138290108, "grad_norm": 1.772929415693109, "learning_rate": 3.274217087374769e-05, "loss": 0.3594, "step": 3783 }, { "epoch": 0.29460366892122036, "grad_norm": 1.8972520681259144, "learning_rate": 3.273832292270745e-05, "loss": 0.3868, "step": 3784 }, { "epoch": 0.29468152401343, "grad_norm": 1.7825745678441658, "learning_rate": 3.2734474178129564e-05, "loss": 0.3552, "step": 3785 }, { "epoch": 0.29475937910563965, "grad_norm": 1.6434447318572987, "learning_rate": 3.2730624640253794e-05, "loss": 0.3139, "step": 3786 }, { "epoch": 0.29483723419784924, "grad_norm": 1.853136929754412, "learning_rate": 3.272677430931995e-05, "loss": 0.3337, "step": 3787 }, { "epoch": 0.2949150892900589, "grad_norm": 1.6526069660091838, "learning_rate": 3.2722923185567884e-05, "loss": 0.32, "step": 3788 }, { "epoch": 0.29499294438226853, "grad_norm": 1.9164366886176667, "learning_rate": 3.2719071269237504e-05, "loss": 0.39, "step": 3789 }, { "epoch": 0.2950707994744781, "grad_norm": 1.90393619794144, "learning_rate": 3.271521856056878e-05, "loss": 0.3887, "step": 3790 }, { "epoch": 0.29514865456668776, "grad_norm": 1.7944008273320675, "learning_rate": 3.2711365059801696e-05, "loss": 0.3518, "step": 3791 }, { "epoch": 0.29522650965889735, "grad_norm": 1.9393473343169048, "learning_rate": 3.270751076717632e-05, "loss": 0.3676, "step": 3792 }, { "epoch": 0.295304364751107, "grad_norm": 1.6401849104174269, "learning_rate": 3.2703655682932755e-05, "loss": 0.3477, "step": 3793 }, { "epoch": 0.29538221984331664, "grad_norm": 1.7416879803562781, "learning_rate": 3.269979980731115e-05, "loss": 0.3244, "step": 3794 }, { "epoch": 0.29546007493552623, "grad_norm": 1.7992491888095612, "learning_rate": 3.269594314055172e-05, "loss": 0.3411, "step": 3795 }, { "epoch": 0.2955379300277359, "grad_norm": 1.9140460172904403, "learning_rate": 3.2692085682894703e-05, "loss": 0.3914, "step": 3796 }, { "epoch": 0.2956157851199455, "grad_norm": 1.9386329983105728, "learning_rate": 3.268822743458041e-05, "loss": 0.3698, "step": 3797 }, { "epoch": 0.2956936402121551, "grad_norm": 1.956807257208689, "learning_rate": 3.268436839584918e-05, "loss": 0.4049, "step": 3798 }, { "epoch": 0.29577149530436475, "grad_norm": 1.909494123386696, "learning_rate": 3.268050856694143e-05, "loss": 0.373, "step": 3799 }, { "epoch": 0.2958493503965744, "grad_norm": 1.7065375915699745, "learning_rate": 3.267664794809759e-05, "loss": 0.3947, "step": 3800 }, { "epoch": 0.2958493503965744, "eval_loss": 0.045685406774282455, "eval_runtime": 162.3967, "eval_samples_per_second": 17.734, "eval_steps_per_second": 0.634, "step": 3800 }, { "epoch": 0.295927205488784, "grad_norm": 1.8521351416169758, "learning_rate": 3.267278653955817e-05, "loss": 0.3636, "step": 3801 }, { "epoch": 0.29600506058099363, "grad_norm": 1.797893950239753, "learning_rate": 3.2668924341563726e-05, "loss": 0.3389, "step": 3802 }, { "epoch": 0.2960829156732033, "grad_norm": 1.841535794387638, "learning_rate": 3.266506135435483e-05, "loss": 0.3901, "step": 3803 }, { "epoch": 0.29616077076541286, "grad_norm": 1.8433617938105609, "learning_rate": 3.266119757817215e-05, "loss": 0.3687, "step": 3804 }, { "epoch": 0.2962386258576225, "grad_norm": 1.80942579719827, "learning_rate": 3.265733301325638e-05, "loss": 0.3599, "step": 3805 }, { "epoch": 0.29631648094983215, "grad_norm": 1.7730047849507924, "learning_rate": 3.265346765984825e-05, "loss": 0.3489, "step": 3806 }, { "epoch": 0.29639433604204174, "grad_norm": 1.994376540125969, "learning_rate": 3.264960151818857e-05, "loss": 0.4278, "step": 3807 }, { "epoch": 0.2964721911342514, "grad_norm": 1.8897472205782477, "learning_rate": 3.2645734588518164e-05, "loss": 0.4249, "step": 3808 }, { "epoch": 0.296550046226461, "grad_norm": 1.7517905134842375, "learning_rate": 3.264186687107794e-05, "loss": 0.3464, "step": 3809 }, { "epoch": 0.2966279013186706, "grad_norm": 1.7919965582996762, "learning_rate": 3.2637998366108825e-05, "loss": 0.3534, "step": 3810 }, { "epoch": 0.29670575641088026, "grad_norm": 1.7010045887361813, "learning_rate": 3.263412907385182e-05, "loss": 0.3623, "step": 3811 }, { "epoch": 0.29678361150308985, "grad_norm": 1.6821399270659918, "learning_rate": 3.263025899454795e-05, "loss": 0.3233, "step": 3812 }, { "epoch": 0.2968614665952995, "grad_norm": 1.9976896535498083, "learning_rate": 3.262638812843833e-05, "loss": 0.391, "step": 3813 }, { "epoch": 0.29693932168750914, "grad_norm": 1.556504713350421, "learning_rate": 3.2622516475764066e-05, "loss": 0.3224, "step": 3814 }, { "epoch": 0.29701717677971873, "grad_norm": 1.7651849081347168, "learning_rate": 3.261864403676636e-05, "loss": 0.3081, "step": 3815 }, { "epoch": 0.2970950318719284, "grad_norm": 1.6689725833462148, "learning_rate": 3.261477081168644e-05, "loss": 0.3304, "step": 3816 }, { "epoch": 0.297172886964138, "grad_norm": 1.7895348318168087, "learning_rate": 3.26108968007656e-05, "loss": 0.359, "step": 3817 }, { "epoch": 0.2972507420563476, "grad_norm": 1.7738034101870253, "learning_rate": 3.260702200424517e-05, "loss": 0.3448, "step": 3818 }, { "epoch": 0.29732859714855725, "grad_norm": 1.786243153838227, "learning_rate": 3.2603146422366507e-05, "loss": 0.3664, "step": 3819 }, { "epoch": 0.2974064522407669, "grad_norm": 1.6844808283644261, "learning_rate": 3.259927005537108e-05, "loss": 0.3639, "step": 3820 }, { "epoch": 0.2974843073329765, "grad_norm": 1.896528041920386, "learning_rate": 3.2595392903500346e-05, "loss": 0.3574, "step": 3821 }, { "epoch": 0.2975621624251861, "grad_norm": 1.7851156980979135, "learning_rate": 3.259151496699583e-05, "loss": 0.3577, "step": 3822 }, { "epoch": 0.29764001751739577, "grad_norm": 1.6983849999351859, "learning_rate": 3.258763624609913e-05, "loss": 0.3099, "step": 3823 }, { "epoch": 0.29771787260960536, "grad_norm": 1.738028787073023, "learning_rate": 3.258375674105185e-05, "loss": 0.3273, "step": 3824 }, { "epoch": 0.297795727701815, "grad_norm": 1.6705449089222428, "learning_rate": 3.257987645209568e-05, "loss": 0.3068, "step": 3825 }, { "epoch": 0.2978735827940246, "grad_norm": 1.830711518163438, "learning_rate": 3.257599537947233e-05, "loss": 0.3548, "step": 3826 }, { "epoch": 0.29795143788623424, "grad_norm": 1.86780833885804, "learning_rate": 3.257211352342359e-05, "loss": 0.338, "step": 3827 }, { "epoch": 0.2980292929784439, "grad_norm": 1.9463279269444649, "learning_rate": 3.2568230884191266e-05, "loss": 0.352, "step": 3828 }, { "epoch": 0.29810714807065347, "grad_norm": 1.693275862378501, "learning_rate": 3.2564347462017236e-05, "loss": 0.3446, "step": 3829 }, { "epoch": 0.2981850031628631, "grad_norm": 1.6997005165511911, "learning_rate": 3.2560463257143414e-05, "loss": 0.3497, "step": 3830 }, { "epoch": 0.29826285825507276, "grad_norm": 1.8965557986385342, "learning_rate": 3.255657826981177e-05, "loss": 0.4451, "step": 3831 }, { "epoch": 0.29834071334728235, "grad_norm": 1.721581698789192, "learning_rate": 3.255269250026432e-05, "loss": 0.3764, "step": 3832 }, { "epoch": 0.298418568439492, "grad_norm": 1.6440016779932916, "learning_rate": 3.2548805948743135e-05, "loss": 0.3258, "step": 3833 }, { "epoch": 0.29849642353170164, "grad_norm": 1.717220295629973, "learning_rate": 3.2544918615490315e-05, "loss": 0.322, "step": 3834 }, { "epoch": 0.2985742786239112, "grad_norm": 1.8894525472009698, "learning_rate": 3.254103050074804e-05, "loss": 0.3766, "step": 3835 }, { "epoch": 0.29865213371612087, "grad_norm": 1.7036049327131402, "learning_rate": 3.253714160475851e-05, "loss": 0.3497, "step": 3836 }, { "epoch": 0.2987299888083305, "grad_norm": 1.8541061454048824, "learning_rate": 3.253325192776399e-05, "loss": 0.3585, "step": 3837 }, { "epoch": 0.2988078439005401, "grad_norm": 1.8133099565450546, "learning_rate": 3.2529361470006786e-05, "loss": 0.3098, "step": 3838 }, { "epoch": 0.29888569899274975, "grad_norm": 1.9860987896951714, "learning_rate": 3.252547023172925e-05, "loss": 0.3651, "step": 3839 }, { "epoch": 0.2989635540849594, "grad_norm": 1.7209941436087526, "learning_rate": 3.25215782131738e-05, "loss": 0.3336, "step": 3840 }, { "epoch": 0.299041409177169, "grad_norm": 1.8179192143168894, "learning_rate": 3.2517685414582886e-05, "loss": 0.3987, "step": 3841 }, { "epoch": 0.2991192642693786, "grad_norm": 1.6938856267600315, "learning_rate": 3.2513791836199e-05, "loss": 0.3247, "step": 3842 }, { "epoch": 0.29919711936158827, "grad_norm": 1.6851662587508374, "learning_rate": 3.250989747826471e-05, "loss": 0.3212, "step": 3843 }, { "epoch": 0.29927497445379786, "grad_norm": 1.8543755116499872, "learning_rate": 3.25060023410226e-05, "loss": 0.3532, "step": 3844 }, { "epoch": 0.2993528295460075, "grad_norm": 1.8286317618364525, "learning_rate": 3.250210642471534e-05, "loss": 0.3519, "step": 3845 }, { "epoch": 0.2994306846382171, "grad_norm": 1.8118147727424763, "learning_rate": 3.2498209729585606e-05, "loss": 0.3285, "step": 3846 }, { "epoch": 0.29950853973042674, "grad_norm": 1.9399392278412657, "learning_rate": 3.2494312255876156e-05, "loss": 0.4065, "step": 3847 }, { "epoch": 0.2995863948226364, "grad_norm": 1.9390303881552824, "learning_rate": 3.2490414003829776e-05, "loss": 0.3846, "step": 3848 }, { "epoch": 0.29966424991484597, "grad_norm": 1.8850053946913277, "learning_rate": 3.248651497368933e-05, "loss": 0.4008, "step": 3849 }, { "epoch": 0.2997421050070556, "grad_norm": 1.8505006431765343, "learning_rate": 3.248261516569767e-05, "loss": 0.3767, "step": 3850 }, { "epoch": 0.2997421050070556, "eval_loss": 0.04474029317498207, "eval_runtime": 162.0466, "eval_samples_per_second": 17.773, "eval_steps_per_second": 0.636, "step": 3850 }, { "epoch": 0.29981996009926526, "grad_norm": 1.867776890276072, "learning_rate": 3.2478714580097774e-05, "loss": 0.3486, "step": 3851 }, { "epoch": 0.29989781519147485, "grad_norm": 1.8673564538386618, "learning_rate": 3.2474813217132605e-05, "loss": 0.328, "step": 3852 }, { "epoch": 0.2999756702836845, "grad_norm": 1.6601963521117542, "learning_rate": 3.247091107704522e-05, "loss": 0.33, "step": 3853 }, { "epoch": 0.30005352537589414, "grad_norm": 1.8595537814683987, "learning_rate": 3.246700816007869e-05, "loss": 0.3649, "step": 3854 }, { "epoch": 0.3001313804681037, "grad_norm": 1.6915966757108818, "learning_rate": 3.2463104466476143e-05, "loss": 0.3289, "step": 3855 }, { "epoch": 0.30020923556031337, "grad_norm": 1.7393851560626363, "learning_rate": 3.2459199996480784e-05, "loss": 0.3617, "step": 3856 }, { "epoch": 0.300287090652523, "grad_norm": 1.8650498123739383, "learning_rate": 3.245529475033581e-05, "loss": 0.3421, "step": 3857 }, { "epoch": 0.3003649457447326, "grad_norm": 1.8458333088848706, "learning_rate": 3.245138872828454e-05, "loss": 0.3722, "step": 3858 }, { "epoch": 0.30044280083694225, "grad_norm": 1.6695667771738474, "learning_rate": 3.244748193057026e-05, "loss": 0.3709, "step": 3859 }, { "epoch": 0.3005206559291519, "grad_norm": 1.6110201752928903, "learning_rate": 3.2443574357436375e-05, "loss": 0.3099, "step": 3860 }, { "epoch": 0.3005985110213615, "grad_norm": 1.7543845781344058, "learning_rate": 3.243966600912629e-05, "loss": 0.3488, "step": 3861 }, { "epoch": 0.3006763661135711, "grad_norm": 1.751802949041251, "learning_rate": 3.2435756885883494e-05, "loss": 0.3484, "step": 3862 }, { "epoch": 0.3007542212057807, "grad_norm": 1.8324676027946012, "learning_rate": 3.24318469879515e-05, "loss": 0.3709, "step": 3863 }, { "epoch": 0.30083207629799036, "grad_norm": 1.762653126531246, "learning_rate": 3.242793631557386e-05, "loss": 0.3554, "step": 3864 }, { "epoch": 0.3009099313902, "grad_norm": 1.621623319465054, "learning_rate": 3.2424024868994207e-05, "loss": 0.3104, "step": 3865 }, { "epoch": 0.3009877864824096, "grad_norm": 1.7454899725630668, "learning_rate": 3.242011264845621e-05, "loss": 0.3391, "step": 3866 }, { "epoch": 0.30106564157461924, "grad_norm": 1.8012912063221644, "learning_rate": 3.2416199654203566e-05, "loss": 0.3693, "step": 3867 }, { "epoch": 0.3011434966668289, "grad_norm": 1.8847170519650194, "learning_rate": 3.241228588648005e-05, "loss": 0.3876, "step": 3868 }, { "epoch": 0.30122135175903847, "grad_norm": 1.8040326429366775, "learning_rate": 3.240837134552946e-05, "loss": 0.3486, "step": 3869 }, { "epoch": 0.3012992068512481, "grad_norm": 1.7180364754964916, "learning_rate": 3.240445603159566e-05, "loss": 0.322, "step": 3870 }, { "epoch": 0.30137706194345776, "grad_norm": 1.6783344201317445, "learning_rate": 3.240053994492256e-05, "loss": 0.336, "step": 3871 }, { "epoch": 0.30145491703566735, "grad_norm": 1.7676656549195309, "learning_rate": 3.239662308575412e-05, "loss": 0.3438, "step": 3872 }, { "epoch": 0.301532772127877, "grad_norm": 1.6732965900129222, "learning_rate": 3.239270545433432e-05, "loss": 0.296, "step": 3873 }, { "epoch": 0.30161062722008664, "grad_norm": 1.7137517549439096, "learning_rate": 3.238878705090722e-05, "loss": 0.3215, "step": 3874 }, { "epoch": 0.3016884823122962, "grad_norm": 1.6851225281533955, "learning_rate": 3.238486787571692e-05, "loss": 0.3103, "step": 3875 }, { "epoch": 0.30176633740450587, "grad_norm": 1.8303564371409278, "learning_rate": 3.2380947929007565e-05, "loss": 0.3667, "step": 3876 }, { "epoch": 0.3018441924967155, "grad_norm": 1.7014251472766908, "learning_rate": 3.237702721102334e-05, "loss": 0.3234, "step": 3877 }, { "epoch": 0.3019220475889251, "grad_norm": 1.772983760996319, "learning_rate": 3.2373105722008516e-05, "loss": 0.3599, "step": 3878 }, { "epoch": 0.30199990268113475, "grad_norm": 1.76872877675498, "learning_rate": 3.2369183462207355e-05, "loss": 0.3822, "step": 3879 }, { "epoch": 0.30207775777334434, "grad_norm": 1.693286608888647, "learning_rate": 3.2365260431864204e-05, "loss": 0.334, "step": 3880 }, { "epoch": 0.302155612865554, "grad_norm": 1.6349651731068862, "learning_rate": 3.236133663122345e-05, "loss": 0.2993, "step": 3881 }, { "epoch": 0.3022334679577636, "grad_norm": 1.8559267449433199, "learning_rate": 3.2357412060529526e-05, "loss": 0.3844, "step": 3882 }, { "epoch": 0.3023113230499732, "grad_norm": 1.7965061850424426, "learning_rate": 3.235348672002692e-05, "loss": 0.331, "step": 3883 }, { "epoch": 0.30238917814218286, "grad_norm": 1.7376738074364828, "learning_rate": 3.234956060996016e-05, "loss": 0.3419, "step": 3884 }, { "epoch": 0.3024670332343925, "grad_norm": 1.7448433688930498, "learning_rate": 3.234563373057381e-05, "loss": 0.3111, "step": 3885 }, { "epoch": 0.3025448883266021, "grad_norm": 1.7427299089696053, "learning_rate": 3.234170608211252e-05, "loss": 0.3361, "step": 3886 }, { "epoch": 0.30262274341881173, "grad_norm": 1.615823472227451, "learning_rate": 3.2337777664820945e-05, "loss": 0.3045, "step": 3887 }, { "epoch": 0.3027005985110214, "grad_norm": 1.632732328826991, "learning_rate": 3.233384847894382e-05, "loss": 0.3109, "step": 3888 }, { "epoch": 0.30277845360323097, "grad_norm": 1.8253082395736677, "learning_rate": 3.2329918524725906e-05, "loss": 0.3775, "step": 3889 }, { "epoch": 0.3028563086954406, "grad_norm": 1.7466967938105638, "learning_rate": 3.2325987802412024e-05, "loss": 0.3244, "step": 3890 }, { "epoch": 0.30293416378765026, "grad_norm": 1.7670024698485824, "learning_rate": 3.2322056312247034e-05, "loss": 0.332, "step": 3891 }, { "epoch": 0.30301201887985985, "grad_norm": 1.7740550542589326, "learning_rate": 3.2318124054475864e-05, "loss": 0.3601, "step": 3892 }, { "epoch": 0.3030898739720695, "grad_norm": 1.8780496441073942, "learning_rate": 3.231419102934346e-05, "loss": 0.3841, "step": 3893 }, { "epoch": 0.30316772906427913, "grad_norm": 1.8494744934756624, "learning_rate": 3.231025723709484e-05, "loss": 0.3618, "step": 3894 }, { "epoch": 0.3032455841564887, "grad_norm": 1.7517476167812442, "learning_rate": 3.230632267797505e-05, "loss": 0.323, "step": 3895 }, { "epoch": 0.30332343924869837, "grad_norm": 1.805073381689645, "learning_rate": 3.230238735222921e-05, "loss": 0.393, "step": 3896 }, { "epoch": 0.303401294340908, "grad_norm": 1.7307677017420933, "learning_rate": 3.229845126010246e-05, "loss": 0.3194, "step": 3897 }, { "epoch": 0.3034791494331176, "grad_norm": 1.6174365103872093, "learning_rate": 3.2294514401840005e-05, "loss": 0.3172, "step": 3898 }, { "epoch": 0.30355700452532725, "grad_norm": 1.7161629859437955, "learning_rate": 3.22905767776871e-05, "loss": 0.3255, "step": 3899 }, { "epoch": 0.30363485961753683, "grad_norm": 1.5550640865675496, "learning_rate": 3.228663838788903e-05, "loss": 0.3098, "step": 3900 }, { "epoch": 0.30363485961753683, "eval_loss": 0.04295503720641136, "eval_runtime": 162.8679, "eval_samples_per_second": 17.683, "eval_steps_per_second": 0.632, "step": 3900 }, { "epoch": 0.3037127147097465, "grad_norm": 1.7177503480242609, "learning_rate": 3.228269923269113e-05, "loss": 0.3254, "step": 3901 }, { "epoch": 0.3037905698019561, "grad_norm": 1.7727575431422595, "learning_rate": 3.2278759312338806e-05, "loss": 0.3694, "step": 3902 }, { "epoch": 0.3038684248941657, "grad_norm": 1.8176101390351003, "learning_rate": 3.227481862707749e-05, "loss": 0.3613, "step": 3903 }, { "epoch": 0.30394627998637536, "grad_norm": 1.978431938550392, "learning_rate": 3.227087717715268e-05, "loss": 0.3875, "step": 3904 }, { "epoch": 0.304024135078585, "grad_norm": 1.859031668120942, "learning_rate": 3.22669349628099e-05, "loss": 0.3505, "step": 3905 }, { "epoch": 0.3041019901707946, "grad_norm": 1.696570062373441, "learning_rate": 3.2262991984294726e-05, "loss": 0.3143, "step": 3906 }, { "epoch": 0.30417984526300423, "grad_norm": 1.6423602776274095, "learning_rate": 3.225904824185279e-05, "loss": 0.3365, "step": 3907 }, { "epoch": 0.3042577003552139, "grad_norm": 1.717889420066234, "learning_rate": 3.225510373572977e-05, "loss": 0.3262, "step": 3908 }, { "epoch": 0.30433555544742347, "grad_norm": 1.7724544252203256, "learning_rate": 3.22511584661714e-05, "loss": 0.3493, "step": 3909 }, { "epoch": 0.3044134105396331, "grad_norm": 1.821080234810358, "learning_rate": 3.224721243342344e-05, "loss": 0.3448, "step": 3910 }, { "epoch": 0.30449126563184276, "grad_norm": 1.673380265158013, "learning_rate": 3.22432656377317e-05, "loss": 0.3263, "step": 3911 }, { "epoch": 0.30456912072405234, "grad_norm": 1.7078444200246936, "learning_rate": 3.2239318079342075e-05, "loss": 0.3252, "step": 3912 }, { "epoch": 0.304646975816262, "grad_norm": 1.7609961952659747, "learning_rate": 3.2235369758500455e-05, "loss": 0.3439, "step": 3913 }, { "epoch": 0.30472483090847163, "grad_norm": 1.7947607237605097, "learning_rate": 3.2231420675452816e-05, "loss": 0.3324, "step": 3914 }, { "epoch": 0.3048026860006812, "grad_norm": 1.961445004051793, "learning_rate": 3.2227470830445156e-05, "loss": 0.3348, "step": 3915 }, { "epoch": 0.30488054109289087, "grad_norm": 1.6754977723742448, "learning_rate": 3.222352022372354e-05, "loss": 0.3364, "step": 3916 }, { "epoch": 0.30495839618510046, "grad_norm": 1.7608844076861452, "learning_rate": 3.221956885553407e-05, "loss": 0.3558, "step": 3917 }, { "epoch": 0.3050362512773101, "grad_norm": 1.7383016380386132, "learning_rate": 3.22156167261229e-05, "loss": 0.3227, "step": 3918 }, { "epoch": 0.30511410636951974, "grad_norm": 1.6772066223009652, "learning_rate": 3.2211663835736225e-05, "loss": 0.3206, "step": 3919 }, { "epoch": 0.30519196146172933, "grad_norm": 1.9266027980106495, "learning_rate": 3.22077101846203e-05, "loss": 0.3326, "step": 3920 }, { "epoch": 0.305269816553939, "grad_norm": 1.805955414745652, "learning_rate": 3.220375577302139e-05, "loss": 0.3373, "step": 3921 }, { "epoch": 0.3053476716461486, "grad_norm": 2.9526619443833995, "learning_rate": 3.219980060118588e-05, "loss": 0.3885, "step": 3922 }, { "epoch": 0.3054255267383582, "grad_norm": 1.9164794143801371, "learning_rate": 3.219584466936013e-05, "loss": 0.3536, "step": 3923 }, { "epoch": 0.30550338183056786, "grad_norm": 1.850624779177286, "learning_rate": 3.219188797779058e-05, "loss": 0.3254, "step": 3924 }, { "epoch": 0.3055812369227775, "grad_norm": 1.7632924667862504, "learning_rate": 3.218793052672372e-05, "loss": 0.3292, "step": 3925 }, { "epoch": 0.3056590920149871, "grad_norm": 1.7257377600857333, "learning_rate": 3.2183972316406066e-05, "loss": 0.3602, "step": 3926 }, { "epoch": 0.30573694710719673, "grad_norm": 1.6230325361318623, "learning_rate": 3.218001334708422e-05, "loss": 0.3325, "step": 3927 }, { "epoch": 0.3058148021994064, "grad_norm": 1.8432740999107056, "learning_rate": 3.217605361900478e-05, "loss": 0.3299, "step": 3928 }, { "epoch": 0.30589265729161597, "grad_norm": 1.694982030774218, "learning_rate": 3.217209313241444e-05, "loss": 0.328, "step": 3929 }, { "epoch": 0.3059705123838256, "grad_norm": 1.84856590414437, "learning_rate": 3.2168131887559916e-05, "loss": 0.3824, "step": 3930 }, { "epoch": 0.30604836747603525, "grad_norm": 1.7013580581960357, "learning_rate": 3.216416988468797e-05, "loss": 0.3147, "step": 3931 }, { "epoch": 0.30612622256824484, "grad_norm": 1.739866743785823, "learning_rate": 3.216020712404541e-05, "loss": 0.3323, "step": 3932 }, { "epoch": 0.3062040776604545, "grad_norm": 1.7760924558760964, "learning_rate": 3.215624360587911e-05, "loss": 0.3285, "step": 3933 }, { "epoch": 0.3062819327526641, "grad_norm": 1.8028796039550086, "learning_rate": 3.2152279330435965e-05, "loss": 0.3206, "step": 3934 }, { "epoch": 0.3063597878448737, "grad_norm": 1.9749213969626285, "learning_rate": 3.2148314297962945e-05, "loss": 0.3735, "step": 3935 }, { "epoch": 0.30643764293708337, "grad_norm": 1.8141986146886933, "learning_rate": 3.2144348508707044e-05, "loss": 0.3516, "step": 3936 }, { "epoch": 0.30651549802929295, "grad_norm": 2.0895247415274856, "learning_rate": 3.214038196291531e-05, "loss": 0.3908, "step": 3937 }, { "epoch": 0.3065933531215026, "grad_norm": 1.8231898620592697, "learning_rate": 3.2136414660834856e-05, "loss": 0.3675, "step": 3938 }, { "epoch": 0.30667120821371224, "grad_norm": 1.7633739713525367, "learning_rate": 3.213244660271281e-05, "loss": 0.3523, "step": 3939 }, { "epoch": 0.30674906330592183, "grad_norm": 1.5860204062175125, "learning_rate": 3.212847778879637e-05, "loss": 0.3077, "step": 3940 }, { "epoch": 0.3068269183981315, "grad_norm": 1.731300308502257, "learning_rate": 3.212450821933277e-05, "loss": 0.3227, "step": 3941 }, { "epoch": 0.3069047734903411, "grad_norm": 1.839090976151773, "learning_rate": 3.212053789456929e-05, "loss": 0.3369, "step": 3942 }, { "epoch": 0.3069826285825507, "grad_norm": 1.828511156460638, "learning_rate": 3.211656681475328e-05, "loss": 0.3389, "step": 3943 }, { "epoch": 0.30706048367476035, "grad_norm": 1.786050532572285, "learning_rate": 3.2112594980132105e-05, "loss": 0.3045, "step": 3944 }, { "epoch": 0.30713833876697, "grad_norm": 1.7111299279889216, "learning_rate": 3.2108622390953206e-05, "loss": 0.3232, "step": 3945 }, { "epoch": 0.3072161938591796, "grad_norm": 1.9046419063633089, "learning_rate": 3.2104649047464045e-05, "loss": 0.3412, "step": 3946 }, { "epoch": 0.30729404895138923, "grad_norm": 1.6875594953171933, "learning_rate": 3.210067494991214e-05, "loss": 0.3291, "step": 3947 }, { "epoch": 0.3073719040435989, "grad_norm": 1.8359052109328107, "learning_rate": 3.209670009854506e-05, "loss": 0.3576, "step": 3948 }, { "epoch": 0.30744975913580846, "grad_norm": 1.6897441437229819, "learning_rate": 3.209272449361043e-05, "loss": 0.3297, "step": 3949 }, { "epoch": 0.3075276142280181, "grad_norm": 1.7483223351850705, "learning_rate": 3.208874813535591e-05, "loss": 0.3805, "step": 3950 }, { "epoch": 0.3075276142280181, "eval_loss": 0.042082808911800385, "eval_runtime": 162.0867, "eval_samples_per_second": 17.768, "eval_steps_per_second": 0.635, "step": 3950 }, { "epoch": 0.30760546932022775, "grad_norm": 1.7947700642812268, "learning_rate": 3.2084771024029195e-05, "loss": 0.3242, "step": 3951 }, { "epoch": 0.30768332441243734, "grad_norm": 1.6919723000348317, "learning_rate": 3.2080793159878045e-05, "loss": 0.3139, "step": 3952 }, { "epoch": 0.307761179504647, "grad_norm": 1.6684120658820796, "learning_rate": 3.207681454315028e-05, "loss": 0.3284, "step": 3953 }, { "epoch": 0.3078390345968566, "grad_norm": 1.842914304444496, "learning_rate": 3.207283517409373e-05, "loss": 0.3792, "step": 3954 }, { "epoch": 0.3079168896890662, "grad_norm": 1.6939662867561995, "learning_rate": 3.206885505295629e-05, "loss": 0.316, "step": 3955 }, { "epoch": 0.30799474478127586, "grad_norm": 1.6325638426346183, "learning_rate": 3.206487417998591e-05, "loss": 0.2908, "step": 3956 }, { "epoch": 0.30807259987348545, "grad_norm": 1.6887603611253497, "learning_rate": 3.2060892555430574e-05, "loss": 0.3295, "step": 3957 }, { "epoch": 0.3081504549656951, "grad_norm": 1.6489767596740281, "learning_rate": 3.205691017953834e-05, "loss": 0.3517, "step": 3958 }, { "epoch": 0.30822831005790474, "grad_norm": 1.8979482318463512, "learning_rate": 3.205292705255726e-05, "loss": 0.3478, "step": 3959 }, { "epoch": 0.30830616515011433, "grad_norm": 1.702237280393733, "learning_rate": 3.204894317473548e-05, "loss": 0.2953, "step": 3960 }, { "epoch": 0.308384020242324, "grad_norm": 1.6658825888759636, "learning_rate": 3.204495854632118e-05, "loss": 0.3267, "step": 3961 }, { "epoch": 0.3084618753345336, "grad_norm": 1.6839959931465132, "learning_rate": 3.204097316756258e-05, "loss": 0.3014, "step": 3962 }, { "epoch": 0.3085397304267432, "grad_norm": 1.7004345769256537, "learning_rate": 3.203698703870794e-05, "loss": 0.3333, "step": 3963 }, { "epoch": 0.30861758551895285, "grad_norm": 1.6237688329599167, "learning_rate": 3.203300016000559e-05, "loss": 0.3411, "step": 3964 }, { "epoch": 0.3086954406111625, "grad_norm": 1.912280413861359, "learning_rate": 3.202901253170389e-05, "loss": 0.4093, "step": 3965 }, { "epoch": 0.3087732957033721, "grad_norm": 1.8275601077149959, "learning_rate": 3.2025024154051256e-05, "loss": 0.3191, "step": 3966 }, { "epoch": 0.30885115079558173, "grad_norm": 2.138691747095045, "learning_rate": 3.202103502729614e-05, "loss": 0.3864, "step": 3967 }, { "epoch": 0.3089290058877914, "grad_norm": 1.7715598376148167, "learning_rate": 3.201704515168704e-05, "loss": 0.3513, "step": 3968 }, { "epoch": 0.30900686098000096, "grad_norm": 1.6735903783016803, "learning_rate": 3.2013054527472517e-05, "loss": 0.3142, "step": 3969 }, { "epoch": 0.3090847160722106, "grad_norm": 1.9066094973451937, "learning_rate": 3.200906315490116e-05, "loss": 0.3463, "step": 3970 }, { "epoch": 0.3091625711644202, "grad_norm": 1.857689210711715, "learning_rate": 3.200507103422162e-05, "loss": 0.3716, "step": 3971 }, { "epoch": 0.30924042625662984, "grad_norm": 1.7642482995702369, "learning_rate": 3.200107816568258e-05, "loss": 0.3205, "step": 3972 }, { "epoch": 0.3093182813488395, "grad_norm": 1.6437719626662344, "learning_rate": 3.199708454953277e-05, "loss": 0.3443, "step": 3973 }, { "epoch": 0.3093961364410491, "grad_norm": 1.670277263008704, "learning_rate": 3.1993090186021e-05, "loss": 0.3096, "step": 3974 }, { "epoch": 0.3094739915332587, "grad_norm": 1.7864748211559378, "learning_rate": 3.198909507539608e-05, "loss": 0.3268, "step": 3975 }, { "epoch": 0.30955184662546836, "grad_norm": 1.8253743175341162, "learning_rate": 3.198509921790689e-05, "loss": 0.3539, "step": 3976 }, { "epoch": 0.30962970171767795, "grad_norm": 1.6382080499695506, "learning_rate": 3.198110261380235e-05, "loss": 0.3636, "step": 3977 }, { "epoch": 0.3097075568098876, "grad_norm": 1.632924980130089, "learning_rate": 3.197710526333143e-05, "loss": 0.328, "step": 3978 }, { "epoch": 0.30978541190209724, "grad_norm": 1.5748932958475768, "learning_rate": 3.197310716674316e-05, "loss": 0.3077, "step": 3979 }, { "epoch": 0.30986326699430683, "grad_norm": 1.8320332962793264, "learning_rate": 3.1969108324286585e-05, "loss": 0.323, "step": 3980 }, { "epoch": 0.3099411220865165, "grad_norm": 1.6518726246599524, "learning_rate": 3.196510873621082e-05, "loss": 0.2784, "step": 3981 }, { "epoch": 0.3100189771787261, "grad_norm": 1.658624744434369, "learning_rate": 3.196110840276503e-05, "loss": 0.3253, "step": 3982 }, { "epoch": 0.3100968322709357, "grad_norm": 1.7473603219742482, "learning_rate": 3.19571073241984e-05, "loss": 0.3387, "step": 3983 }, { "epoch": 0.31017468736314535, "grad_norm": 1.8669731311072615, "learning_rate": 3.195310550076019e-05, "loss": 0.3634, "step": 3984 }, { "epoch": 0.310252542455355, "grad_norm": 1.7304161085087308, "learning_rate": 3.19491029326997e-05, "loss": 0.2956, "step": 3985 }, { "epoch": 0.3103303975475646, "grad_norm": 1.8764731426696886, "learning_rate": 3.1945099620266246e-05, "loss": 0.3345, "step": 3986 }, { "epoch": 0.31040825263977423, "grad_norm": 1.8791501537334812, "learning_rate": 3.194109556370925e-05, "loss": 0.3465, "step": 3987 }, { "epoch": 0.3104861077319838, "grad_norm": 1.889111827460955, "learning_rate": 3.193709076327812e-05, "loss": 0.3355, "step": 3988 }, { "epoch": 0.31056396282419346, "grad_norm": 1.8061681483220227, "learning_rate": 3.1933085219222345e-05, "loss": 0.3411, "step": 3989 }, { "epoch": 0.3106418179164031, "grad_norm": 1.8623297987332579, "learning_rate": 3.1929078931791454e-05, "loss": 0.3357, "step": 3990 }, { "epoch": 0.3107196730086127, "grad_norm": 1.707374312945917, "learning_rate": 3.192507190123501e-05, "loss": 0.3345, "step": 3991 }, { "epoch": 0.31079752810082234, "grad_norm": 1.8080687045406467, "learning_rate": 3.1921064127802645e-05, "loss": 0.3321, "step": 3992 }, { "epoch": 0.310875383193032, "grad_norm": 1.5291885732629606, "learning_rate": 3.1917055611744017e-05, "loss": 0.3098, "step": 3993 }, { "epoch": 0.3109532382852416, "grad_norm": 1.6285607009129328, "learning_rate": 3.1913046353308843e-05, "loss": 0.3442, "step": 3994 }, { "epoch": 0.3110310933774512, "grad_norm": 1.8203276838841551, "learning_rate": 3.190903635274688e-05, "loss": 0.3472, "step": 3995 }, { "epoch": 0.31110894846966086, "grad_norm": 1.6084877774056925, "learning_rate": 3.1905025610307915e-05, "loss": 0.2653, "step": 3996 }, { "epoch": 0.31118680356187045, "grad_norm": 1.8589565051843293, "learning_rate": 3.190101412624183e-05, "loss": 0.3537, "step": 3997 }, { "epoch": 0.3112646586540801, "grad_norm": 1.8590360646902353, "learning_rate": 3.189700190079849e-05, "loss": 0.3378, "step": 3998 }, { "epoch": 0.31134251374628974, "grad_norm": 1.7233394195523384, "learning_rate": 3.189298893422787e-05, "loss": 0.2921, "step": 3999 }, { "epoch": 0.31142036883849933, "grad_norm": 1.969668118370128, "learning_rate": 3.188897522677992e-05, "loss": 0.3206, "step": 4000 }, { "epoch": 0.31142036883849933, "eval_loss": 0.04073982313275337, "eval_runtime": 162.1794, "eval_samples_per_second": 17.758, "eval_steps_per_second": 0.635, "step": 4000 }, { "epoch": 0.311498223930709, "grad_norm": 1.745152796012247, "learning_rate": 3.188496077870471e-05, "loss": 0.2966, "step": 4001 }, { "epoch": 0.3115760790229186, "grad_norm": 1.7610826381080098, "learning_rate": 3.1880945590252305e-05, "loss": 0.3615, "step": 4002 }, { "epoch": 0.3116539341151282, "grad_norm": 1.6986097717776147, "learning_rate": 3.187692966167284e-05, "loss": 0.3194, "step": 4003 }, { "epoch": 0.31173178920733785, "grad_norm": 1.7459735259351867, "learning_rate": 3.187291299321648e-05, "loss": 0.327, "step": 4004 }, { "epoch": 0.3118096442995475, "grad_norm": 1.7281945243125472, "learning_rate": 3.1868895585133445e-05, "loss": 0.3004, "step": 4005 }, { "epoch": 0.3118874993917571, "grad_norm": 1.7242097295472099, "learning_rate": 3.1864877437674003e-05, "loss": 0.3382, "step": 4006 }, { "epoch": 0.31196535448396673, "grad_norm": 1.660458288356157, "learning_rate": 3.1860858551088475e-05, "loss": 0.2922, "step": 4007 }, { "epoch": 0.3120432095761763, "grad_norm": 1.7360560754880119, "learning_rate": 3.1856838925627203e-05, "loss": 0.3932, "step": 4008 }, { "epoch": 0.31212106466838596, "grad_norm": 1.6943327802356996, "learning_rate": 3.18528185615406e-05, "loss": 0.2961, "step": 4009 }, { "epoch": 0.3121989197605956, "grad_norm": 1.6414324276259467, "learning_rate": 3.1848797459079116e-05, "loss": 0.2914, "step": 4010 }, { "epoch": 0.3122767748528052, "grad_norm": 1.6470432514868454, "learning_rate": 3.1844775618493235e-05, "loss": 0.2932, "step": 4011 }, { "epoch": 0.31235462994501484, "grad_norm": 1.6987802906994252, "learning_rate": 3.1840753040033515e-05, "loss": 0.3112, "step": 4012 }, { "epoch": 0.3124324850372245, "grad_norm": 1.8047198387609713, "learning_rate": 3.183672972395054e-05, "loss": 0.3499, "step": 4013 }, { "epoch": 0.31251034012943407, "grad_norm": 1.67278409036597, "learning_rate": 3.1832705670494935e-05, "loss": 0.2931, "step": 4014 }, { "epoch": 0.3125881952216437, "grad_norm": 1.6520086088030383, "learning_rate": 3.182868087991739e-05, "loss": 0.306, "step": 4015 }, { "epoch": 0.31266605031385336, "grad_norm": 1.838997547247915, "learning_rate": 3.182465535246863e-05, "loss": 0.3166, "step": 4016 }, { "epoch": 0.31274390540606295, "grad_norm": 1.6739584972408248, "learning_rate": 3.182062908839941e-05, "loss": 0.3131, "step": 4017 }, { "epoch": 0.3128217604982726, "grad_norm": 1.9124413483939817, "learning_rate": 3.181660208796057e-05, "loss": 0.3733, "step": 4018 }, { "epoch": 0.31289961559048224, "grad_norm": 1.7280010641975403, "learning_rate": 3.181257435140296e-05, "loss": 0.313, "step": 4019 }, { "epoch": 0.3129774706826918, "grad_norm": 1.5765420485049153, "learning_rate": 3.180854587897749e-05, "loss": 0.2977, "step": 4020 }, { "epoch": 0.31305532577490147, "grad_norm": 1.6984707001967865, "learning_rate": 3.180451667093512e-05, "loss": 0.3401, "step": 4021 }, { "epoch": 0.3131331808671111, "grad_norm": 1.7036875694892555, "learning_rate": 3.180048672752684e-05, "loss": 0.3193, "step": 4022 }, { "epoch": 0.3132110359593207, "grad_norm": 1.8231623729200819, "learning_rate": 3.1796456049003715e-05, "loss": 0.3697, "step": 4023 }, { "epoch": 0.31328889105153035, "grad_norm": 1.6987876030272242, "learning_rate": 3.179242463561681e-05, "loss": 0.3512, "step": 4024 }, { "epoch": 0.31336674614373994, "grad_norm": 1.6758138756514918, "learning_rate": 3.17883924876173e-05, "loss": 0.356, "step": 4025 }, { "epoch": 0.3134446012359496, "grad_norm": 1.7583684027712096, "learning_rate": 3.178435960525634e-05, "loss": 0.3117, "step": 4026 }, { "epoch": 0.3135224563281592, "grad_norm": 1.6427784689440255, "learning_rate": 3.178032598878517e-05, "loss": 0.3004, "step": 4027 }, { "epoch": 0.3136003114203688, "grad_norm": 1.7802540215584333, "learning_rate": 3.177629163845505e-05, "loss": 0.3073, "step": 4028 }, { "epoch": 0.31367816651257846, "grad_norm": 1.7651973048589682, "learning_rate": 3.177225655451733e-05, "loss": 0.3055, "step": 4029 }, { "epoch": 0.3137560216047881, "grad_norm": 1.6985485372377815, "learning_rate": 3.176822073722336e-05, "loss": 0.3178, "step": 4030 }, { "epoch": 0.3138338766969977, "grad_norm": 1.6826807899941498, "learning_rate": 3.176418418682454e-05, "loss": 0.2924, "step": 4031 }, { "epoch": 0.31391173178920734, "grad_norm": 1.8289952286978148, "learning_rate": 3.1760146903572356e-05, "loss": 0.3257, "step": 4032 }, { "epoch": 0.313989586881417, "grad_norm": 1.8026477950114923, "learning_rate": 3.175610888771829e-05, "loss": 0.3406, "step": 4033 }, { "epoch": 0.31406744197362657, "grad_norm": 1.657207783093427, "learning_rate": 3.17520701395139e-05, "loss": 0.3279, "step": 4034 }, { "epoch": 0.3141452970658362, "grad_norm": 1.6859399574385205, "learning_rate": 3.174803065921079e-05, "loss": 0.3279, "step": 4035 }, { "epoch": 0.31422315215804586, "grad_norm": 1.631512472018096, "learning_rate": 3.174399044706057e-05, "loss": 0.3129, "step": 4036 }, { "epoch": 0.31430100725025545, "grad_norm": 1.6699538552183493, "learning_rate": 3.1739949503314965e-05, "loss": 0.3148, "step": 4037 }, { "epoch": 0.3143788623424651, "grad_norm": 1.8174475700049109, "learning_rate": 3.173590782822568e-05, "loss": 0.334, "step": 4038 }, { "epoch": 0.31445671743467474, "grad_norm": 1.9116193280576719, "learning_rate": 3.1731865422044504e-05, "loss": 0.376, "step": 4039 }, { "epoch": 0.3145345725268843, "grad_norm": 1.6223332325722941, "learning_rate": 3.172782228502326e-05, "loss": 0.2854, "step": 4040 }, { "epoch": 0.31461242761909397, "grad_norm": 1.6854003605431518, "learning_rate": 3.172377841741381e-05, "loss": 0.3357, "step": 4041 }, { "epoch": 0.31469028271130356, "grad_norm": 1.6033298781173673, "learning_rate": 3.171973381946807e-05, "loss": 0.3395, "step": 4042 }, { "epoch": 0.3147681378035132, "grad_norm": 1.830220146156763, "learning_rate": 3.1715688491438e-05, "loss": 0.3524, "step": 4043 }, { "epoch": 0.31484599289572285, "grad_norm": 1.6781797085382948, "learning_rate": 3.1711642433575604e-05, "loss": 0.2924, "step": 4044 }, { "epoch": 0.31492384798793244, "grad_norm": 1.7975688614578627, "learning_rate": 3.1707595646132935e-05, "loss": 0.3468, "step": 4045 }, { "epoch": 0.3150017030801421, "grad_norm": 1.673710035316611, "learning_rate": 3.170354812936209e-05, "loss": 0.3295, "step": 4046 }, { "epoch": 0.3150795581723517, "grad_norm": 1.6539129557353864, "learning_rate": 3.1699499883515206e-05, "loss": 0.2821, "step": 4047 }, { "epoch": 0.3151574132645613, "grad_norm": 1.6892189263774562, "learning_rate": 3.169545090884447e-05, "loss": 0.3125, "step": 4048 }, { "epoch": 0.31523526835677096, "grad_norm": 1.8827659547658757, "learning_rate": 3.1691401205602126e-05, "loss": 0.3815, "step": 4049 }, { "epoch": 0.3153131234489806, "grad_norm": 1.5727660426193741, "learning_rate": 3.168735077404043e-05, "loss": 0.2533, "step": 4050 }, { "epoch": 0.3153131234489806, "eval_loss": 0.040250442922115326, "eval_runtime": 162.0106, "eval_samples_per_second": 17.777, "eval_steps_per_second": 0.636, "step": 4050 }, { "epoch": 0.3153909785411902, "grad_norm": 1.7812530507299287, "learning_rate": 3.168329961441172e-05, "loss": 0.339, "step": 4051 }, { "epoch": 0.31546883363339984, "grad_norm": 1.8553394359707076, "learning_rate": 3.167924772696836e-05, "loss": 0.3463, "step": 4052 }, { "epoch": 0.3155466887256095, "grad_norm": 1.6479375256863966, "learning_rate": 3.167519511196277e-05, "loss": 0.2988, "step": 4053 }, { "epoch": 0.31562454381781907, "grad_norm": 1.7023398527123923, "learning_rate": 3.1671141769647395e-05, "loss": 0.2931, "step": 4054 }, { "epoch": 0.3157023989100287, "grad_norm": 1.9198808426068596, "learning_rate": 3.166708770027475e-05, "loss": 0.3412, "step": 4055 }, { "epoch": 0.31578025400223836, "grad_norm": 1.5604932597888406, "learning_rate": 3.1663032904097387e-05, "loss": 0.2719, "step": 4056 }, { "epoch": 0.31585810909444795, "grad_norm": 1.6730668556742685, "learning_rate": 3.1658977381367884e-05, "loss": 0.3143, "step": 4057 }, { "epoch": 0.3159359641866576, "grad_norm": 1.6280862246609562, "learning_rate": 3.1654921132338894e-05, "loss": 0.2992, "step": 4058 }, { "epoch": 0.3160138192788672, "grad_norm": 1.718773693961682, "learning_rate": 3.165086415726311e-05, "loss": 0.3433, "step": 4059 }, { "epoch": 0.3160916743710768, "grad_norm": 1.7881912519559091, "learning_rate": 3.1646806456393244e-05, "loss": 0.3312, "step": 4060 }, { "epoch": 0.31616952946328647, "grad_norm": 1.7749440400655319, "learning_rate": 3.1642748029982086e-05, "loss": 0.3209, "step": 4061 }, { "epoch": 0.31624738455549606, "grad_norm": 1.7396859380884666, "learning_rate": 3.1638688878282445e-05, "loss": 0.3902, "step": 4062 }, { "epoch": 0.3163252396477057, "grad_norm": 1.5724797513187727, "learning_rate": 3.16346290015472e-05, "loss": 0.3469, "step": 4063 }, { "epoch": 0.31640309473991535, "grad_norm": 1.7989381595979788, "learning_rate": 3.1630568400029254e-05, "loss": 0.3209, "step": 4064 }, { "epoch": 0.31648094983212494, "grad_norm": 1.6456669651669773, "learning_rate": 3.1626507073981564e-05, "loss": 0.2953, "step": 4065 }, { "epoch": 0.3165588049243346, "grad_norm": 1.6531670417950943, "learning_rate": 3.162244502365713e-05, "loss": 0.3135, "step": 4066 }, { "epoch": 0.3166366600165442, "grad_norm": 1.530295157330441, "learning_rate": 3.161838224930901e-05, "loss": 0.2936, "step": 4067 }, { "epoch": 0.3167145151087538, "grad_norm": 1.648206967847613, "learning_rate": 3.161431875119028e-05, "loss": 0.3086, "step": 4068 }, { "epoch": 0.31679237020096346, "grad_norm": 1.8449058407484866, "learning_rate": 3.161025452955408e-05, "loss": 0.3199, "step": 4069 }, { "epoch": 0.3168702252931731, "grad_norm": 1.5607639354757237, "learning_rate": 3.16061895846536e-05, "loss": 0.366, "step": 4070 }, { "epoch": 0.3169480803853827, "grad_norm": 1.620492426816925, "learning_rate": 3.160212391674207e-05, "loss": 0.3041, "step": 4071 }, { "epoch": 0.31702593547759234, "grad_norm": 1.7049747441920913, "learning_rate": 3.159805752607275e-05, "loss": 0.3223, "step": 4072 }, { "epoch": 0.317103790569802, "grad_norm": 1.725235820616254, "learning_rate": 3.159399041289896e-05, "loss": 0.3182, "step": 4073 }, { "epoch": 0.31718164566201157, "grad_norm": 1.6320317238453848, "learning_rate": 3.1589922577474064e-05, "loss": 0.3157, "step": 4074 }, { "epoch": 0.3172595007542212, "grad_norm": 1.7446663494497199, "learning_rate": 3.1585854020051465e-05, "loss": 0.3092, "step": 4075 }, { "epoch": 0.31733735584643086, "grad_norm": 1.7797096782973059, "learning_rate": 3.158178474088463e-05, "loss": 0.3027, "step": 4076 }, { "epoch": 0.31741521093864045, "grad_norm": 1.6652632805557517, "learning_rate": 3.157771474022703e-05, "loss": 0.3016, "step": 4077 }, { "epoch": 0.3174930660308501, "grad_norm": 1.438894047648848, "learning_rate": 3.157364401833223e-05, "loss": 0.2649, "step": 4078 }, { "epoch": 0.3175709211230597, "grad_norm": 1.6848420110397326, "learning_rate": 3.156957257545381e-05, "loss": 0.279, "step": 4079 }, { "epoch": 0.3176487762152693, "grad_norm": 1.6632534168189967, "learning_rate": 3.15655004118454e-05, "loss": 0.3052, "step": 4080 }, { "epoch": 0.31772663130747897, "grad_norm": 1.7001553991591316, "learning_rate": 3.1561427527760676e-05, "loss": 0.332, "step": 4081 }, { "epoch": 0.31780448639968856, "grad_norm": 1.603646923443121, "learning_rate": 3.155735392345336e-05, "loss": 0.309, "step": 4082 }, { "epoch": 0.3178823414918982, "grad_norm": 1.7146697037225547, "learning_rate": 3.155327959917722e-05, "loss": 0.3256, "step": 4083 }, { "epoch": 0.31796019658410785, "grad_norm": 1.6524669119075905, "learning_rate": 3.154920455518606e-05, "loss": 0.2954, "step": 4084 }, { "epoch": 0.31803805167631743, "grad_norm": 1.6467690066992415, "learning_rate": 3.154512879173376e-05, "loss": 0.3254, "step": 4085 }, { "epoch": 0.3181159067685271, "grad_norm": 1.7319027418176496, "learning_rate": 3.154105230907418e-05, "loss": 0.3175, "step": 4086 }, { "epoch": 0.3181937618607367, "grad_norm": 1.7456184168895823, "learning_rate": 3.1536975107461306e-05, "loss": 0.3182, "step": 4087 }, { "epoch": 0.3182716169529463, "grad_norm": 1.7210257578435668, "learning_rate": 3.15328971871491e-05, "loss": 0.3215, "step": 4088 }, { "epoch": 0.31834947204515596, "grad_norm": 1.7082027718540556, "learning_rate": 3.152881854839161e-05, "loss": 0.3053, "step": 4089 }, { "epoch": 0.3184273271373656, "grad_norm": 1.579506474695, "learning_rate": 3.1524739191442924e-05, "loss": 0.2818, "step": 4090 }, { "epoch": 0.3185051822295752, "grad_norm": 1.6121682936605353, "learning_rate": 3.1520659116557154e-05, "loss": 0.31, "step": 4091 }, { "epoch": 0.31858303732178483, "grad_norm": 1.6177244343313055, "learning_rate": 3.151657832398847e-05, "loss": 0.3196, "step": 4092 }, { "epoch": 0.3186608924139945, "grad_norm": 1.6842067883031961, "learning_rate": 3.151249681399109e-05, "loss": 0.3084, "step": 4093 }, { "epoch": 0.31873874750620407, "grad_norm": 1.6406699112253156, "learning_rate": 3.150841458681927e-05, "loss": 0.2885, "step": 4094 }, { "epoch": 0.3188166025984137, "grad_norm": 1.6575736664544793, "learning_rate": 3.150433164272732e-05, "loss": 0.2969, "step": 4095 }, { "epoch": 0.3188944576906233, "grad_norm": 1.803049461470251, "learning_rate": 3.150024798196958e-05, "loss": 0.3067, "step": 4096 }, { "epoch": 0.31897231278283295, "grad_norm": 1.863861404719825, "learning_rate": 3.149616360480045e-05, "loss": 0.3339, "step": 4097 }, { "epoch": 0.3190501678750426, "grad_norm": 1.7460216392591297, "learning_rate": 3.149207851147436e-05, "loss": 0.301, "step": 4098 }, { "epoch": 0.3191280229672522, "grad_norm": 1.7106772855707941, "learning_rate": 3.1487992702245796e-05, "loss": 0.2765, "step": 4099 }, { "epoch": 0.3192058780594618, "grad_norm": 1.6558127220444039, "learning_rate": 3.148390617736929e-05, "loss": 0.3087, "step": 4100 }, { "epoch": 0.3192058780594618, "eval_loss": 0.03915274143218994, "eval_runtime": 162.3911, "eval_samples_per_second": 17.735, "eval_steps_per_second": 0.634, "step": 4100 }, { "epoch": 0.31928373315167147, "grad_norm": 1.6792205706372878, "learning_rate": 3.147981893709941e-05, "loss": 0.297, "step": 4101 }, { "epoch": 0.31936158824388106, "grad_norm": 1.6693027673007201, "learning_rate": 3.1475730981690765e-05, "loss": 0.2923, "step": 4102 }, { "epoch": 0.3194394433360907, "grad_norm": 1.6712704206530027, "learning_rate": 3.1471642311398016e-05, "loss": 0.2754, "step": 4103 }, { "epoch": 0.31951729842830034, "grad_norm": 1.7225277879278251, "learning_rate": 3.146755292647588e-05, "loss": 0.3042, "step": 4104 }, { "epoch": 0.31959515352050993, "grad_norm": 1.7465239430260469, "learning_rate": 3.1463462827179095e-05, "loss": 0.3536, "step": 4105 }, { "epoch": 0.3196730086127196, "grad_norm": 1.8480360923154757, "learning_rate": 3.145937201376246e-05, "loss": 0.3343, "step": 4106 }, { "epoch": 0.3197508637049292, "grad_norm": 1.7191962495944184, "learning_rate": 3.145528048648081e-05, "loss": 0.2988, "step": 4107 }, { "epoch": 0.3198287187971388, "grad_norm": 1.6426326290464812, "learning_rate": 3.145118824558904e-05, "loss": 0.2907, "step": 4108 }, { "epoch": 0.31990657388934846, "grad_norm": 1.679596019598352, "learning_rate": 3.1447095291342056e-05, "loss": 0.299, "step": 4109 }, { "epoch": 0.3199844289815581, "grad_norm": 1.644186034832715, "learning_rate": 3.144300162399485e-05, "loss": 0.3071, "step": 4110 }, { "epoch": 0.3200622840737677, "grad_norm": 1.6576229011996284, "learning_rate": 3.143890724380242e-05, "loss": 0.2851, "step": 4111 }, { "epoch": 0.32014013916597733, "grad_norm": 1.907457365895202, "learning_rate": 3.143481215101985e-05, "loss": 0.3373, "step": 4112 }, { "epoch": 0.3202179942581869, "grad_norm": 1.821323543925482, "learning_rate": 3.143071634590222e-05, "loss": 0.2795, "step": 4113 }, { "epoch": 0.32029584935039657, "grad_norm": 1.8172561787048174, "learning_rate": 3.142661982870469e-05, "loss": 0.3232, "step": 4114 }, { "epoch": 0.3203737044426062, "grad_norm": 1.728369789247947, "learning_rate": 3.142252259968246e-05, "loss": 0.3077, "step": 4115 }, { "epoch": 0.3204515595348158, "grad_norm": 1.6035237252012229, "learning_rate": 3.1418424659090766e-05, "loss": 0.2621, "step": 4116 }, { "epoch": 0.32052941462702544, "grad_norm": 1.6519346421578525, "learning_rate": 3.141432600718488e-05, "loss": 0.2934, "step": 4117 }, { "epoch": 0.3206072697192351, "grad_norm": 1.948992852300016, "learning_rate": 3.141022664422014e-05, "loss": 0.3175, "step": 4118 }, { "epoch": 0.3206851248114447, "grad_norm": 1.6939651089961394, "learning_rate": 3.140612657045191e-05, "loss": 0.301, "step": 4119 }, { "epoch": 0.3207629799036543, "grad_norm": 1.7172717284519872, "learning_rate": 3.1402025786135605e-05, "loss": 0.3038, "step": 4120 }, { "epoch": 0.32084083499586397, "grad_norm": 1.6646847814487395, "learning_rate": 3.13979242915267e-05, "loss": 0.3031, "step": 4121 }, { "epoch": 0.32091869008807355, "grad_norm": 1.7524048940116628, "learning_rate": 3.1393822086880675e-05, "loss": 0.2894, "step": 4122 }, { "epoch": 0.3209965451802832, "grad_norm": 1.612106070176654, "learning_rate": 3.1389719172453097e-05, "loss": 0.286, "step": 4123 }, { "epoch": 0.32107440027249284, "grad_norm": 1.762725013992349, "learning_rate": 3.138561554849955e-05, "loss": 0.2988, "step": 4124 }, { "epoch": 0.32115225536470243, "grad_norm": 1.671682200921641, "learning_rate": 3.138151121527567e-05, "loss": 0.3044, "step": 4125 }, { "epoch": 0.3212301104569121, "grad_norm": 1.6979747644427503, "learning_rate": 3.1377406173037134e-05, "loss": 0.2942, "step": 4126 }, { "epoch": 0.3213079655491217, "grad_norm": 1.6514300576505823, "learning_rate": 3.1373300422039675e-05, "loss": 0.3243, "step": 4127 }, { "epoch": 0.3213858206413313, "grad_norm": 1.6248182772455997, "learning_rate": 3.136919396253906e-05, "loss": 0.2929, "step": 4128 }, { "epoch": 0.32146367573354095, "grad_norm": 1.6041808192598193, "learning_rate": 3.1365086794791097e-05, "loss": 0.2949, "step": 4129 }, { "epoch": 0.3215415308257506, "grad_norm": 1.7391174835227694, "learning_rate": 3.136097891905165e-05, "loss": 0.3162, "step": 4130 }, { "epoch": 0.3216193859179602, "grad_norm": 1.7003721658969817, "learning_rate": 3.135687033557663e-05, "loss": 0.3169, "step": 4131 }, { "epoch": 0.32169724101016983, "grad_norm": 1.6072341977749411, "learning_rate": 3.1352761044621956e-05, "loss": 0.2653, "step": 4132 }, { "epoch": 0.3217750961023794, "grad_norm": 1.7510426429863832, "learning_rate": 3.134865104644363e-05, "loss": 0.3147, "step": 4133 }, { "epoch": 0.32185295119458907, "grad_norm": 1.6023654962422171, "learning_rate": 3.1344540341297697e-05, "loss": 0.275, "step": 4134 }, { "epoch": 0.3219308062867987, "grad_norm": 1.6840281492604394, "learning_rate": 3.134042892944022e-05, "loss": 0.3457, "step": 4135 }, { "epoch": 0.3220086613790083, "grad_norm": 1.623439264140137, "learning_rate": 3.133631681112732e-05, "loss": 0.2921, "step": 4136 }, { "epoch": 0.32208651647121794, "grad_norm": 1.7556750924526625, "learning_rate": 3.1332203986615174e-05, "loss": 0.3622, "step": 4137 }, { "epoch": 0.3221643715634276, "grad_norm": 1.6721597676265236, "learning_rate": 3.132809045615998e-05, "loss": 0.307, "step": 4138 }, { "epoch": 0.3222422266556372, "grad_norm": 1.6499038605185576, "learning_rate": 3.132397622001801e-05, "loss": 0.3367, "step": 4139 }, { "epoch": 0.3223200817478468, "grad_norm": 1.6696264173068327, "learning_rate": 3.131986127844554e-05, "loss": 0.2794, "step": 4140 }, { "epoch": 0.32239793684005646, "grad_norm": 1.6476181819851001, "learning_rate": 3.131574563169892e-05, "loss": 0.3151, "step": 4141 }, { "epoch": 0.32247579193226605, "grad_norm": 1.4624349710836606, "learning_rate": 3.131162928003454e-05, "loss": 0.2668, "step": 4142 }, { "epoch": 0.3225536470244757, "grad_norm": 1.6448572787691083, "learning_rate": 3.130751222370882e-05, "loss": 0.2766, "step": 4143 }, { "epoch": 0.32263150211668534, "grad_norm": 1.5408322542670712, "learning_rate": 3.130339446297824e-05, "loss": 0.2891, "step": 4144 }, { "epoch": 0.32270935720889493, "grad_norm": 1.642086541216229, "learning_rate": 3.1299275998099314e-05, "loss": 0.2845, "step": 4145 }, { "epoch": 0.3227872123011046, "grad_norm": 1.7439832115096612, "learning_rate": 3.1295156829328615e-05, "loss": 0.2992, "step": 4146 }, { "epoch": 0.3228650673933142, "grad_norm": 1.653713966493813, "learning_rate": 3.129103695692273e-05, "loss": 0.3061, "step": 4147 }, { "epoch": 0.3229429224855238, "grad_norm": 1.656332755095124, "learning_rate": 3.1286916381138315e-05, "loss": 0.2804, "step": 4148 }, { "epoch": 0.32302077757773345, "grad_norm": 1.522513403354207, "learning_rate": 3.1282795102232056e-05, "loss": 0.2573, "step": 4149 }, { "epoch": 0.32309863266994304, "grad_norm": 1.791559915964422, "learning_rate": 3.1278673120460705e-05, "loss": 0.3066, "step": 4150 }, { "epoch": 0.32309863266994304, "eval_loss": 0.03830171748995781, "eval_runtime": 162.4901, "eval_samples_per_second": 17.724, "eval_steps_per_second": 0.634, "step": 4150 }, { "epoch": 0.3231764877621527, "grad_norm": 1.7810481643104623, "learning_rate": 3.127455043608103e-05, "loss": 0.3113, "step": 4151 }, { "epoch": 0.32325434285436233, "grad_norm": 1.7856459736880022, "learning_rate": 3.127042704934986e-05, "loss": 0.3327, "step": 4152 }, { "epoch": 0.3233321979465719, "grad_norm": 1.5888025592497925, "learning_rate": 3.1266302960524054e-05, "loss": 0.2941, "step": 4153 }, { "epoch": 0.32341005303878156, "grad_norm": 1.7589285922252094, "learning_rate": 3.126217816986054e-05, "loss": 0.3324, "step": 4154 }, { "epoch": 0.3234879081309912, "grad_norm": 1.674836870291911, "learning_rate": 3.1258052677616255e-05, "loss": 0.2706, "step": 4155 }, { "epoch": 0.3235657632232008, "grad_norm": 1.6279487562188293, "learning_rate": 3.1253926484048206e-05, "loss": 0.3129, "step": 4156 }, { "epoch": 0.32364361831541044, "grad_norm": 1.7085883645179358, "learning_rate": 3.1249799589413444e-05, "loss": 0.3242, "step": 4157 }, { "epoch": 0.3237214734076201, "grad_norm": 1.600496234164175, "learning_rate": 3.124567199396904e-05, "loss": 0.2647, "step": 4158 }, { "epoch": 0.3237993284998297, "grad_norm": 1.7606407226469845, "learning_rate": 3.124154369797212e-05, "loss": 0.2993, "step": 4159 }, { "epoch": 0.3238771835920393, "grad_norm": 1.5633753170286722, "learning_rate": 3.123741470167988e-05, "loss": 0.2577, "step": 4160 }, { "epoch": 0.32395503868424896, "grad_norm": 1.7888282780383042, "learning_rate": 3.1233285005349525e-05, "loss": 0.294, "step": 4161 }, { "epoch": 0.32403289377645855, "grad_norm": 1.6046810326133498, "learning_rate": 3.122915460923832e-05, "loss": 0.286, "step": 4162 }, { "epoch": 0.3241107488686682, "grad_norm": 1.692325605890141, "learning_rate": 3.1225023513603553e-05, "loss": 0.3182, "step": 4163 }, { "epoch": 0.32418860396087784, "grad_norm": 1.7233985667841984, "learning_rate": 3.1220891718702584e-05, "loss": 0.3039, "step": 4164 }, { "epoch": 0.32426645905308743, "grad_norm": 1.6860582142111311, "learning_rate": 3.1216759224792804e-05, "loss": 0.3045, "step": 4165 }, { "epoch": 0.3243443141452971, "grad_norm": 1.654074757724691, "learning_rate": 3.121262603213166e-05, "loss": 0.2552, "step": 4166 }, { "epoch": 0.32442216923750666, "grad_norm": 1.7680024662486151, "learning_rate": 3.12084921409766e-05, "loss": 0.3227, "step": 4167 }, { "epoch": 0.3245000243297163, "grad_norm": 1.6094231729803419, "learning_rate": 3.1204357551585177e-05, "loss": 0.287, "step": 4168 }, { "epoch": 0.32457787942192595, "grad_norm": 1.7574755163500315, "learning_rate": 3.120022226421494e-05, "loss": 0.2919, "step": 4169 }, { "epoch": 0.32465573451413554, "grad_norm": 1.6845458015218928, "learning_rate": 3.11960862791235e-05, "loss": 0.2827, "step": 4170 }, { "epoch": 0.3247335896063452, "grad_norm": 1.7476405093855798, "learning_rate": 3.119194959656851e-05, "loss": 0.2837, "step": 4171 }, { "epoch": 0.32481144469855483, "grad_norm": 1.7726572453360003, "learning_rate": 3.118781221680768e-05, "loss": 0.309, "step": 4172 }, { "epoch": 0.3248892997907644, "grad_norm": 1.6844761572920888, "learning_rate": 3.1183674140098714e-05, "loss": 0.3171, "step": 4173 }, { "epoch": 0.32496715488297406, "grad_norm": 1.6558953408515353, "learning_rate": 3.117953536669943e-05, "loss": 0.3003, "step": 4174 }, { "epoch": 0.3250450099751837, "grad_norm": 1.6709992849495143, "learning_rate": 3.117539589686764e-05, "loss": 0.2835, "step": 4175 }, { "epoch": 0.3251228650673933, "grad_norm": 1.7104827744113895, "learning_rate": 3.1171255730861214e-05, "loss": 0.3093, "step": 4176 }, { "epoch": 0.32520072015960294, "grad_norm": 1.555199364474563, "learning_rate": 3.1167114868938066e-05, "loss": 0.2943, "step": 4177 }, { "epoch": 0.3252785752518126, "grad_norm": 1.5781448869181662, "learning_rate": 3.116297331135616e-05, "loss": 0.2672, "step": 4178 }, { "epoch": 0.3253564303440222, "grad_norm": 1.588489611623376, "learning_rate": 3.1158831058373474e-05, "loss": 0.3059, "step": 4179 }, { "epoch": 0.3254342854362318, "grad_norm": 1.5343049357654401, "learning_rate": 3.115468811024807e-05, "loss": 0.2717, "step": 4180 }, { "epoch": 0.32551214052844146, "grad_norm": 1.6321672557423144, "learning_rate": 3.1150544467238024e-05, "loss": 0.2957, "step": 4181 }, { "epoch": 0.32558999562065105, "grad_norm": 1.5990379127975185, "learning_rate": 3.114640012960147e-05, "loss": 0.2772, "step": 4182 }, { "epoch": 0.3256678507128607, "grad_norm": 1.4999227431929423, "learning_rate": 3.114225509759659e-05, "loss": 0.2564, "step": 4183 }, { "epoch": 0.32574570580507034, "grad_norm": 1.6489617636157559, "learning_rate": 3.1138109371481583e-05, "loss": 0.2739, "step": 4184 }, { "epoch": 0.32582356089727993, "grad_norm": 1.598554656993708, "learning_rate": 3.1133962951514725e-05, "loss": 0.2881, "step": 4185 }, { "epoch": 0.3259014159894896, "grad_norm": 1.7556403930832345, "learning_rate": 3.1129815837954295e-05, "loss": 0.2751, "step": 4186 }, { "epoch": 0.32597927108169916, "grad_norm": 1.653201525688253, "learning_rate": 3.112566803105866e-05, "loss": 0.2921, "step": 4187 }, { "epoch": 0.3260571261739088, "grad_norm": 1.7456331870246993, "learning_rate": 3.1121519531086204e-05, "loss": 0.3274, "step": 4188 }, { "epoch": 0.32613498126611845, "grad_norm": 1.600775746830566, "learning_rate": 3.111737033829535e-05, "loss": 0.2828, "step": 4189 }, { "epoch": 0.32621283635832804, "grad_norm": 1.6258915259483693, "learning_rate": 3.111322045294459e-05, "loss": 0.2729, "step": 4190 }, { "epoch": 0.3262906914505377, "grad_norm": 1.5995566682822, "learning_rate": 3.1109069875292425e-05, "loss": 0.2913, "step": 4191 }, { "epoch": 0.32636854654274733, "grad_norm": 1.67149674806379, "learning_rate": 3.110491860559743e-05, "loss": 0.2992, "step": 4192 }, { "epoch": 0.3264464016349569, "grad_norm": 1.7329935568433963, "learning_rate": 3.11007666441182e-05, "loss": 0.309, "step": 4193 }, { "epoch": 0.32652425672716656, "grad_norm": 1.598661995450378, "learning_rate": 3.1096613991113386e-05, "loss": 0.2808, "step": 4194 }, { "epoch": 0.3266021118193762, "grad_norm": 1.6405951017747975, "learning_rate": 3.109246064684168e-05, "loss": 0.2668, "step": 4195 }, { "epoch": 0.3266799669115858, "grad_norm": 1.7677629385659706, "learning_rate": 3.108830661156181e-05, "loss": 0.2879, "step": 4196 }, { "epoch": 0.32675782200379544, "grad_norm": 1.7262282638324846, "learning_rate": 3.108415188553257e-05, "loss": 0.294, "step": 4197 }, { "epoch": 0.3268356770960051, "grad_norm": 1.6427992773580637, "learning_rate": 3.107999646901276e-05, "loss": 0.2949, "step": 4198 }, { "epoch": 0.3269135321882147, "grad_norm": 1.7258346023481457, "learning_rate": 3.107584036226125e-05, "loss": 0.3137, "step": 4199 }, { "epoch": 0.3269913872804243, "grad_norm": 1.7580388228028745, "learning_rate": 3.107168356553694e-05, "loss": 0.2975, "step": 4200 }, { "epoch": 0.3269913872804243, "eval_loss": 0.037437744438648224, "eval_runtime": 162.2516, "eval_samples_per_second": 17.75, "eval_steps_per_second": 0.635, "step": 4200 }, { "epoch": 0.32706924237263396, "grad_norm": 1.655239730801606, "learning_rate": 3.1067526079098794e-05, "loss": 0.2911, "step": 4201 }, { "epoch": 0.32714709746484355, "grad_norm": 1.5398910290517156, "learning_rate": 3.1063367903205795e-05, "loss": 0.2917, "step": 4202 }, { "epoch": 0.3272249525570532, "grad_norm": 1.579770281406894, "learning_rate": 3.105920903811698e-05, "loss": 0.2651, "step": 4203 }, { "epoch": 0.3273028076492628, "grad_norm": 1.7481984635356884, "learning_rate": 3.105504948409141e-05, "loss": 0.3391, "step": 4204 }, { "epoch": 0.32738066274147243, "grad_norm": 1.6191915221071487, "learning_rate": 3.1050889241388236e-05, "loss": 0.2785, "step": 4205 }, { "epoch": 0.3274585178336821, "grad_norm": 1.7681567159284555, "learning_rate": 3.10467283102666e-05, "loss": 0.3064, "step": 4206 }, { "epoch": 0.32753637292589166, "grad_norm": 1.6625777301015268, "learning_rate": 3.104256669098572e-05, "loss": 0.3046, "step": 4207 }, { "epoch": 0.3276142280181013, "grad_norm": 1.6810211328689268, "learning_rate": 3.103840438380484e-05, "loss": 0.2766, "step": 4208 }, { "epoch": 0.32769208311031095, "grad_norm": 1.6755063271817148, "learning_rate": 3.1034241388983246e-05, "loss": 0.2939, "step": 4209 }, { "epoch": 0.32776993820252054, "grad_norm": 1.8990053562490232, "learning_rate": 3.103007770678028e-05, "loss": 0.3508, "step": 4210 }, { "epoch": 0.3278477932947302, "grad_norm": 1.6911730784683194, "learning_rate": 3.1025913337455324e-05, "loss": 0.3111, "step": 4211 }, { "epoch": 0.3279256483869398, "grad_norm": 1.6734990278670403, "learning_rate": 3.1021748281267786e-05, "loss": 0.2663, "step": 4212 }, { "epoch": 0.3280035034791494, "grad_norm": 1.585801533744984, "learning_rate": 3.101758253847714e-05, "loss": 0.2826, "step": 4213 }, { "epoch": 0.32808135857135906, "grad_norm": 1.763521234052735, "learning_rate": 3.101341610934289e-05, "loss": 0.3212, "step": 4214 }, { "epoch": 0.3281592136635687, "grad_norm": 1.6025497665475905, "learning_rate": 3.100924899412459e-05, "loss": 0.2801, "step": 4215 }, { "epoch": 0.3282370687557783, "grad_norm": 1.6264288935546798, "learning_rate": 3.100508119308181e-05, "loss": 0.2789, "step": 4216 }, { "epoch": 0.32831492384798794, "grad_norm": 1.8140051474877519, "learning_rate": 3.100091270647421e-05, "loss": 0.3128, "step": 4217 }, { "epoch": 0.3283927789401976, "grad_norm": 1.6777295143736783, "learning_rate": 3.0996743534561455e-05, "loss": 0.2943, "step": 4218 }, { "epoch": 0.32847063403240717, "grad_norm": 1.6025784212698897, "learning_rate": 3.099257367760327e-05, "loss": 0.2768, "step": 4219 }, { "epoch": 0.3285484891246168, "grad_norm": 1.5902242834663736, "learning_rate": 3.0988403135859414e-05, "loss": 0.2576, "step": 4220 }, { "epoch": 0.3286263442168264, "grad_norm": 1.6366330811893042, "learning_rate": 3.098423190958969e-05, "loss": 0.2758, "step": 4221 }, { "epoch": 0.32870419930903605, "grad_norm": 1.8001914393169824, "learning_rate": 3.098005999905395e-05, "loss": 0.3269, "step": 4222 }, { "epoch": 0.3287820544012457, "grad_norm": 1.807475575847759, "learning_rate": 3.0975887404512084e-05, "loss": 0.2878, "step": 4223 }, { "epoch": 0.3288599094934553, "grad_norm": 1.7156484313208955, "learning_rate": 3.097171412622402e-05, "loss": 0.2801, "step": 4224 }, { "epoch": 0.3289377645856649, "grad_norm": 1.5581050426703507, "learning_rate": 3.096754016444974e-05, "loss": 0.2734, "step": 4225 }, { "epoch": 0.32901561967787457, "grad_norm": 1.645364839663623, "learning_rate": 3.0963365519449264e-05, "loss": 0.2817, "step": 4226 }, { "epoch": 0.32909347477008416, "grad_norm": 1.5638205982865725, "learning_rate": 3.0959190191482646e-05, "loss": 0.2801, "step": 4227 }, { "epoch": 0.3291713298622938, "grad_norm": 1.7120732129765643, "learning_rate": 3.095501418080999e-05, "loss": 0.2676, "step": 4228 }, { "epoch": 0.32924918495450345, "grad_norm": 1.774923809783496, "learning_rate": 3.095083748769144e-05, "loss": 0.3289, "step": 4229 }, { "epoch": 0.32932704004671304, "grad_norm": 1.7621655619101295, "learning_rate": 3.0946660112387195e-05, "loss": 0.2762, "step": 4230 }, { "epoch": 0.3294048951389227, "grad_norm": 1.7740459986853532, "learning_rate": 3.094248205515748e-05, "loss": 0.3263, "step": 4231 }, { "epoch": 0.3294827502311323, "grad_norm": 1.660788238267335, "learning_rate": 3.093830331626256e-05, "loss": 0.3037, "step": 4232 }, { "epoch": 0.3295606053233419, "grad_norm": 1.6091949101732932, "learning_rate": 3.093412389596276e-05, "loss": 0.2903, "step": 4233 }, { "epoch": 0.32963846041555156, "grad_norm": 1.6566433188038152, "learning_rate": 3.092994379451844e-05, "loss": 0.2716, "step": 4234 }, { "epoch": 0.3297163155077612, "grad_norm": 1.6069017540379562, "learning_rate": 3.092576301219e-05, "loss": 0.2748, "step": 4235 }, { "epoch": 0.3297941705999708, "grad_norm": 1.5958271314156673, "learning_rate": 3.092158154923787e-05, "loss": 0.2631, "step": 4236 }, { "epoch": 0.32987202569218044, "grad_norm": 1.6022189511207636, "learning_rate": 3.091739940592254e-05, "loss": 0.2949, "step": 4237 }, { "epoch": 0.3299498807843901, "grad_norm": 1.571911721970866, "learning_rate": 3.091321658250456e-05, "loss": 0.272, "step": 4238 }, { "epoch": 0.33002773587659967, "grad_norm": 1.5271677885697947, "learning_rate": 3.090903307924448e-05, "loss": 0.2601, "step": 4239 }, { "epoch": 0.3301055909688093, "grad_norm": 1.6675694639963103, "learning_rate": 3.090484889640291e-05, "loss": 0.2767, "step": 4240 }, { "epoch": 0.3301834460610189, "grad_norm": 1.779264783788203, "learning_rate": 3.090066403424051e-05, "loss": 0.3286, "step": 4241 }, { "epoch": 0.33026130115322855, "grad_norm": 1.5643286681759432, "learning_rate": 3.0896478493017985e-05, "loss": 0.2725, "step": 4242 }, { "epoch": 0.3303391562454382, "grad_norm": 1.4928228422725638, "learning_rate": 3.089229227299606e-05, "loss": 0.2821, "step": 4243 }, { "epoch": 0.3304170113376478, "grad_norm": 1.7375170474843395, "learning_rate": 3.0888105374435536e-05, "loss": 0.2882, "step": 4244 }, { "epoch": 0.3304948664298574, "grad_norm": 1.6535673991747375, "learning_rate": 3.0883917797597215e-05, "loss": 0.3052, "step": 4245 }, { "epoch": 0.33057272152206707, "grad_norm": 1.6590090358821465, "learning_rate": 3.087972954274198e-05, "loss": 0.2905, "step": 4246 }, { "epoch": 0.33065057661427666, "grad_norm": 1.5911825702853437, "learning_rate": 3.0875540610130736e-05, "loss": 0.2719, "step": 4247 }, { "epoch": 0.3307284317064863, "grad_norm": 1.519420897914311, "learning_rate": 3.0871351000024425e-05, "loss": 0.2686, "step": 4248 }, { "epoch": 0.33080628679869595, "grad_norm": 1.6672619729477756, "learning_rate": 3.086716071268405e-05, "loss": 0.2891, "step": 4249 }, { "epoch": 0.33088414189090554, "grad_norm": 1.5809803738384969, "learning_rate": 3.086296974837064e-05, "loss": 0.2666, "step": 4250 }, { "epoch": 0.33088414189090554, "eval_loss": 0.03639058396220207, "eval_runtime": 162.1557, "eval_samples_per_second": 17.761, "eval_steps_per_second": 0.635, "step": 4250 }, { "epoch": 0.3309619969831152, "grad_norm": 1.5150192986275606, "learning_rate": 3.0858778107345275e-05, "loss": 0.263, "step": 4251 }, { "epoch": 0.3310398520753248, "grad_norm": 1.570144107754521, "learning_rate": 3.0854585789869086e-05, "loss": 0.256, "step": 4252 }, { "epoch": 0.3311177071675344, "grad_norm": 1.6571696101011109, "learning_rate": 3.085039279620321e-05, "loss": 0.2651, "step": 4253 }, { "epoch": 0.33119556225974406, "grad_norm": 1.674932372424156, "learning_rate": 3.0846199126608874e-05, "loss": 0.3148, "step": 4254 }, { "epoch": 0.3312734173519537, "grad_norm": 1.5062864725309277, "learning_rate": 3.08420047813473e-05, "loss": 0.262, "step": 4255 }, { "epoch": 0.3313512724441633, "grad_norm": 1.7127479176713547, "learning_rate": 3.0837809760679805e-05, "loss": 0.3348, "step": 4256 }, { "epoch": 0.33142912753637294, "grad_norm": 1.6895422946950402, "learning_rate": 3.08336140648677e-05, "loss": 0.2724, "step": 4257 }, { "epoch": 0.3315069826285825, "grad_norm": 1.6610304800306952, "learning_rate": 3.082941769417235e-05, "loss": 0.2995, "step": 4258 }, { "epoch": 0.33158483772079217, "grad_norm": 1.6178752672447545, "learning_rate": 3.0825220648855195e-05, "loss": 0.3115, "step": 4259 }, { "epoch": 0.3316626928130018, "grad_norm": 1.5141225609015316, "learning_rate": 3.082102292917767e-05, "loss": 0.27, "step": 4260 }, { "epoch": 0.3317405479052114, "grad_norm": 1.7421203535282581, "learning_rate": 3.0816824535401284e-05, "loss": 0.2681, "step": 4261 }, { "epoch": 0.33181840299742105, "grad_norm": 1.6821202255294032, "learning_rate": 3.0812625467787564e-05, "loss": 0.2915, "step": 4262 }, { "epoch": 0.3318962580896307, "grad_norm": 1.625509149378036, "learning_rate": 3.080842572659811e-05, "loss": 0.3075, "step": 4263 }, { "epoch": 0.3319741131818403, "grad_norm": 1.3475574770724499, "learning_rate": 3.080422531209453e-05, "loss": 0.2332, "step": 4264 }, { "epoch": 0.3320519682740499, "grad_norm": 1.819443447348623, "learning_rate": 3.08000242245385e-05, "loss": 0.3042, "step": 4265 }, { "epoch": 0.33212982336625957, "grad_norm": 1.5507149403845266, "learning_rate": 3.079582246419173e-05, "loss": 0.2822, "step": 4266 }, { "epoch": 0.33220767845846916, "grad_norm": 1.6633205608223278, "learning_rate": 3.0791620031315967e-05, "loss": 0.2849, "step": 4267 }, { "epoch": 0.3322855335506788, "grad_norm": 1.6254918321270437, "learning_rate": 3.078741692617299e-05, "loss": 0.251, "step": 4268 }, { "epoch": 0.33236338864288845, "grad_norm": 1.6714542932293865, "learning_rate": 3.078321314902465e-05, "loss": 0.3079, "step": 4269 }, { "epoch": 0.33244124373509804, "grad_norm": 1.719894109525594, "learning_rate": 3.077900870013283e-05, "loss": 0.2583, "step": 4270 }, { "epoch": 0.3325190988273077, "grad_norm": 1.6431574775940694, "learning_rate": 3.077480357975941e-05, "loss": 0.2873, "step": 4271 }, { "epoch": 0.3325969539195173, "grad_norm": 1.6015033770306168, "learning_rate": 3.0770597788166385e-05, "loss": 0.2898, "step": 4272 }, { "epoch": 0.3326748090117269, "grad_norm": 1.6362418538086652, "learning_rate": 3.076639132561574e-05, "loss": 0.3013, "step": 4273 }, { "epoch": 0.33275266410393656, "grad_norm": 1.4569842492531226, "learning_rate": 3.076218419236953e-05, "loss": 0.2544, "step": 4274 }, { "epoch": 0.33283051919614615, "grad_norm": 1.594870076128406, "learning_rate": 3.0757976388689827e-05, "loss": 0.2906, "step": 4275 }, { "epoch": 0.3329083742883558, "grad_norm": 1.7516627737564203, "learning_rate": 3.0753767914838755e-05, "loss": 0.3147, "step": 4276 }, { "epoch": 0.33298622938056543, "grad_norm": 1.5896873052468743, "learning_rate": 3.07495587710785e-05, "loss": 0.2833, "step": 4277 }, { "epoch": 0.333064084472775, "grad_norm": 1.6903188351601142, "learning_rate": 3.0745348957671256e-05, "loss": 0.3129, "step": 4278 }, { "epoch": 0.33314193956498467, "grad_norm": 1.6862207976293415, "learning_rate": 3.074113847487928e-05, "loss": 0.3056, "step": 4279 }, { "epoch": 0.3332197946571943, "grad_norm": 1.6322920642019383, "learning_rate": 3.073692732296486e-05, "loss": 0.2933, "step": 4280 }, { "epoch": 0.3332976497494039, "grad_norm": 1.6226328777401124, "learning_rate": 3.073271550219034e-05, "loss": 0.2449, "step": 4281 }, { "epoch": 0.33337550484161355, "grad_norm": 1.591454485902364, "learning_rate": 3.072850301281809e-05, "loss": 0.285, "step": 4282 }, { "epoch": 0.3334533599338232, "grad_norm": 1.598660461851551, "learning_rate": 3.072428985511054e-05, "loss": 0.2534, "step": 4283 }, { "epoch": 0.3335312150260328, "grad_norm": 1.6286296522138972, "learning_rate": 3.0720076029330134e-05, "loss": 0.3086, "step": 4284 }, { "epoch": 0.3336090701182424, "grad_norm": 1.660033879895089, "learning_rate": 3.071586153573938e-05, "loss": 0.3037, "step": 4285 }, { "epoch": 0.33368692521045207, "grad_norm": 1.6640306101284306, "learning_rate": 3.071164637460082e-05, "loss": 0.298, "step": 4286 }, { "epoch": 0.33376478030266166, "grad_norm": 1.5800971683583447, "learning_rate": 3.070743054617705e-05, "loss": 0.2758, "step": 4287 }, { "epoch": 0.3338426353948713, "grad_norm": 1.6551160588972404, "learning_rate": 3.070321405073067e-05, "loss": 0.2896, "step": 4288 }, { "epoch": 0.33392049048708095, "grad_norm": 1.6396091853807802, "learning_rate": 3.0698996888524376e-05, "loss": 0.2723, "step": 4289 }, { "epoch": 0.33399834557929053, "grad_norm": 1.5432500274119274, "learning_rate": 3.069477905982086e-05, "loss": 0.2892, "step": 4290 }, { "epoch": 0.3340762006715002, "grad_norm": 1.5419454288039156, "learning_rate": 3.0690560564882875e-05, "loss": 0.2637, "step": 4291 }, { "epoch": 0.33415405576370977, "grad_norm": 1.6786628908304249, "learning_rate": 3.0686341403973225e-05, "loss": 0.3088, "step": 4292 }, { "epoch": 0.3342319108559194, "grad_norm": 1.547148178642243, "learning_rate": 3.068212157735474e-05, "loss": 0.2671, "step": 4293 }, { "epoch": 0.33430976594812906, "grad_norm": 1.5994496896012942, "learning_rate": 3.067790108529028e-05, "loss": 0.3022, "step": 4294 }, { "epoch": 0.33438762104033865, "grad_norm": 1.69479083731392, "learning_rate": 3.067367992804278e-05, "loss": 0.2646, "step": 4295 }, { "epoch": 0.3344654761325483, "grad_norm": 1.5324646083093274, "learning_rate": 3.066945810587518e-05, "loss": 0.2222, "step": 4296 }, { "epoch": 0.33454333122475793, "grad_norm": 1.709352484486542, "learning_rate": 3.06652356190505e-05, "loss": 0.2937, "step": 4297 }, { "epoch": 0.3346211863169675, "grad_norm": 1.5110942587638787, "learning_rate": 3.066101246783178e-05, "loss": 0.2397, "step": 4298 }, { "epoch": 0.33469904140917717, "grad_norm": 1.501175725101388, "learning_rate": 3.065678865248208e-05, "loss": 0.2715, "step": 4299 }, { "epoch": 0.3347768965013868, "grad_norm": 1.7033919542334957, "learning_rate": 3.065256417326454e-05, "loss": 0.3093, "step": 4300 }, { "epoch": 0.3347768965013868, "eval_loss": 0.03514719381928444, "eval_runtime": 162.3306, "eval_samples_per_second": 17.742, "eval_steps_per_second": 0.635, "step": 4300 }, { "epoch": 0.3348547515935964, "grad_norm": 1.6415493255128712, "learning_rate": 3.0648339030442326e-05, "loss": 0.2781, "step": 4301 }, { "epoch": 0.33493260668580604, "grad_norm": 1.5031389135762612, "learning_rate": 3.064411322427865e-05, "loss": 0.2903, "step": 4302 }, { "epoch": 0.3350104617780157, "grad_norm": 1.5719327133233798, "learning_rate": 3.0639886755036746e-05, "loss": 0.2562, "step": 4303 }, { "epoch": 0.3350883168702253, "grad_norm": 1.6969599426443216, "learning_rate": 3.063565962297991e-05, "loss": 0.2894, "step": 4304 }, { "epoch": 0.3351661719624349, "grad_norm": 1.644363051261709, "learning_rate": 3.063143182837147e-05, "loss": 0.2584, "step": 4305 }, { "epoch": 0.33524402705464457, "grad_norm": 1.7299394474989287, "learning_rate": 3.0627203371474795e-05, "loss": 0.3314, "step": 4306 }, { "epoch": 0.33532188214685416, "grad_norm": 1.6311764367392183, "learning_rate": 3.0622974252553306e-05, "loss": 0.3057, "step": 4307 }, { "epoch": 0.3353997372390638, "grad_norm": 1.8077216618204437, "learning_rate": 3.061874447187045e-05, "loss": 0.3264, "step": 4308 }, { "epoch": 0.33547759233127344, "grad_norm": 1.6183347626751554, "learning_rate": 3.061451402968973e-05, "loss": 0.2862, "step": 4309 }, { "epoch": 0.33555544742348303, "grad_norm": 1.5177177191008229, "learning_rate": 3.061028292627468e-05, "loss": 0.2729, "step": 4310 }, { "epoch": 0.3356333025156927, "grad_norm": 1.5474771833077143, "learning_rate": 3.060605116188887e-05, "loss": 0.2648, "step": 4311 }, { "epoch": 0.33571115760790227, "grad_norm": 1.4926505188589407, "learning_rate": 3.060181873679594e-05, "loss": 0.2705, "step": 4312 }, { "epoch": 0.3357890127001119, "grad_norm": 1.6566197656219872, "learning_rate": 3.059758565125952e-05, "loss": 0.2801, "step": 4313 }, { "epoch": 0.33586686779232156, "grad_norm": 1.6282916784935135, "learning_rate": 3.059335190554333e-05, "loss": 0.2567, "step": 4314 }, { "epoch": 0.33594472288453114, "grad_norm": 1.6611664476061636, "learning_rate": 3.0589117499911115e-05, "loss": 0.292, "step": 4315 }, { "epoch": 0.3360225779767408, "grad_norm": 1.7678015705970072, "learning_rate": 3.058488243462665e-05, "loss": 0.2712, "step": 4316 }, { "epoch": 0.33610043306895043, "grad_norm": 1.6379527596929253, "learning_rate": 3.0580646709953766e-05, "loss": 0.2792, "step": 4317 }, { "epoch": 0.33617828816116, "grad_norm": 1.5390913779120634, "learning_rate": 3.057641032615632e-05, "loss": 0.2497, "step": 4318 }, { "epoch": 0.33625614325336967, "grad_norm": 1.5263780718721478, "learning_rate": 3.0572173283498226e-05, "loss": 0.2748, "step": 4319 }, { "epoch": 0.3363339983455793, "grad_norm": 1.6491079746767272, "learning_rate": 3.0567935582243426e-05, "loss": 0.2724, "step": 4320 }, { "epoch": 0.3364118534377889, "grad_norm": 1.5483051539351955, "learning_rate": 3.0563697222655924e-05, "loss": 0.2772, "step": 4321 }, { "epoch": 0.33648970852999854, "grad_norm": 1.6164545447956324, "learning_rate": 3.055945820499973e-05, "loss": 0.2831, "step": 4322 }, { "epoch": 0.3365675636222082, "grad_norm": 1.5037724854506112, "learning_rate": 3.055521852953892e-05, "loss": 0.2572, "step": 4323 }, { "epoch": 0.3366454187144178, "grad_norm": 1.5935922806361427, "learning_rate": 3.0550978196537616e-05, "loss": 0.2663, "step": 4324 }, { "epoch": 0.3367232738066274, "grad_norm": 1.5778683604998751, "learning_rate": 3.054673720625997e-05, "loss": 0.2452, "step": 4325 }, { "epoch": 0.33680112889883707, "grad_norm": 1.5459622338358698, "learning_rate": 3.054249555897016e-05, "loss": 0.2458, "step": 4326 }, { "epoch": 0.33687898399104665, "grad_norm": 1.609182014044515, "learning_rate": 3.053825325493244e-05, "loss": 0.2608, "step": 4327 }, { "epoch": 0.3369568390832563, "grad_norm": 1.7276633187842967, "learning_rate": 3.0534010294411065e-05, "loss": 0.2905, "step": 4328 }, { "epoch": 0.3370346941754659, "grad_norm": 1.5387456627424498, "learning_rate": 3.052976667767037e-05, "loss": 0.2623, "step": 4329 }, { "epoch": 0.33711254926767553, "grad_norm": 1.6624660018341528, "learning_rate": 3.0525522404974703e-05, "loss": 0.2891, "step": 4330 }, { "epoch": 0.3371904043598852, "grad_norm": 1.5406906465316166, "learning_rate": 3.052127747658847e-05, "loss": 0.2438, "step": 4331 }, { "epoch": 0.33726825945209477, "grad_norm": 1.7440113987186125, "learning_rate": 3.0517031892776095e-05, "loss": 0.2838, "step": 4332 }, { "epoch": 0.3373461145443044, "grad_norm": 1.5587534181982314, "learning_rate": 3.0512785653802078e-05, "loss": 0.2849, "step": 4333 }, { "epoch": 0.33742396963651405, "grad_norm": 1.6123947711965134, "learning_rate": 3.050853875993092e-05, "loss": 0.256, "step": 4334 }, { "epoch": 0.33750182472872364, "grad_norm": 1.687226241978756, "learning_rate": 3.0504291211427193e-05, "loss": 0.279, "step": 4335 }, { "epoch": 0.3375796798209333, "grad_norm": 1.615317503306909, "learning_rate": 3.05000430085555e-05, "loss": 0.2893, "step": 4336 }, { "epoch": 0.33765753491314293, "grad_norm": 1.6375539967440924, "learning_rate": 3.0495794151580483e-05, "loss": 0.2685, "step": 4337 }, { "epoch": 0.3377353900053525, "grad_norm": 1.5632463703194974, "learning_rate": 3.049154464076683e-05, "loss": 0.2525, "step": 4338 }, { "epoch": 0.33781324509756216, "grad_norm": 1.6939234455271752, "learning_rate": 3.0487294476379253e-05, "loss": 0.294, "step": 4339 }, { "epoch": 0.3378911001897718, "grad_norm": 1.790772449413786, "learning_rate": 3.0483043658682524e-05, "loss": 0.3176, "step": 4340 }, { "epoch": 0.3379689552819814, "grad_norm": 1.486074049937129, "learning_rate": 3.0478792187941458e-05, "loss": 0.2557, "step": 4341 }, { "epoch": 0.33804681037419104, "grad_norm": 1.6044743020846166, "learning_rate": 3.0474540064420885e-05, "loss": 0.2712, "step": 4342 }, { "epoch": 0.3381246654664007, "grad_norm": 1.6196846608042093, "learning_rate": 3.0470287288385702e-05, "loss": 0.2719, "step": 4343 }, { "epoch": 0.3382025205586103, "grad_norm": 1.6823527867972252, "learning_rate": 3.046603386010084e-05, "loss": 0.3144, "step": 4344 }, { "epoch": 0.3382803756508199, "grad_norm": 1.6224443895162477, "learning_rate": 3.046177977983126e-05, "loss": 0.2829, "step": 4345 }, { "epoch": 0.3383582307430295, "grad_norm": 1.6383302080563575, "learning_rate": 3.0457525047841975e-05, "loss": 0.3024, "step": 4346 }, { "epoch": 0.33843608583523915, "grad_norm": 1.5822357298069838, "learning_rate": 3.0453269664398038e-05, "loss": 0.2586, "step": 4347 }, { "epoch": 0.3385139409274488, "grad_norm": 1.5316833963620642, "learning_rate": 3.044901362976453e-05, "loss": 0.2333, "step": 4348 }, { "epoch": 0.3385917960196584, "grad_norm": 1.7699688862040146, "learning_rate": 3.044475694420659e-05, "loss": 0.313, "step": 4349 }, { "epoch": 0.33866965111186803, "grad_norm": 1.5653750432746856, "learning_rate": 3.0440499607989385e-05, "loss": 0.2569, "step": 4350 }, { "epoch": 0.33866965111186803, "eval_loss": 0.03495045751333237, "eval_runtime": 162.1826, "eval_samples_per_second": 17.758, "eval_steps_per_second": 0.635, "step": 4350 }, { "epoch": 0.3387475062040777, "grad_norm": 1.5505948117197947, "learning_rate": 3.043624162137813e-05, "loss": 0.2867, "step": 4351 }, { "epoch": 0.33882536129628726, "grad_norm": 1.6199776372337964, "learning_rate": 3.043198298463808e-05, "loss": 0.2616, "step": 4352 }, { "epoch": 0.3389032163884969, "grad_norm": 1.5705443220831121, "learning_rate": 3.0427723698034515e-05, "loss": 0.277, "step": 4353 }, { "epoch": 0.33898107148070655, "grad_norm": 1.5879626114427698, "learning_rate": 3.0423463761832786e-05, "loss": 0.2604, "step": 4354 }, { "epoch": 0.33905892657291614, "grad_norm": 1.7250217330632374, "learning_rate": 3.041920317629826e-05, "loss": 0.2916, "step": 4355 }, { "epoch": 0.3391367816651258, "grad_norm": 1.5993400206177126, "learning_rate": 3.041494194169635e-05, "loss": 0.2697, "step": 4356 }, { "epoch": 0.33921463675733543, "grad_norm": 1.7415183175431115, "learning_rate": 3.041068005829251e-05, "loss": 0.347, "step": 4357 }, { "epoch": 0.339292491849545, "grad_norm": 1.61423005852688, "learning_rate": 3.0406417526352237e-05, "loss": 0.275, "step": 4358 }, { "epoch": 0.33937034694175466, "grad_norm": 1.6161109612391062, "learning_rate": 3.0402154346141067e-05, "loss": 0.2701, "step": 4359 }, { "epoch": 0.3394482020339643, "grad_norm": 1.5884832061620704, "learning_rate": 3.0397890517924576e-05, "loss": 0.2629, "step": 4360 }, { "epoch": 0.3395260571261739, "grad_norm": 1.4512441545789847, "learning_rate": 3.0393626041968383e-05, "loss": 0.2268, "step": 4361 }, { "epoch": 0.33960391221838354, "grad_norm": 1.6118554136708363, "learning_rate": 3.038936091853814e-05, "loss": 0.2431, "step": 4362 }, { "epoch": 0.3396817673105932, "grad_norm": 1.5313311997904326, "learning_rate": 3.038509514789954e-05, "loss": 0.2759, "step": 4363 }, { "epoch": 0.3397596224028028, "grad_norm": 1.6477498245424225, "learning_rate": 3.038082873031833e-05, "loss": 0.2797, "step": 4364 }, { "epoch": 0.3398374774950124, "grad_norm": 1.5409911586172922, "learning_rate": 3.037656166606029e-05, "loss": 0.2262, "step": 4365 }, { "epoch": 0.339915332587222, "grad_norm": 1.563666051578862, "learning_rate": 3.0372293955391224e-05, "loss": 0.2701, "step": 4366 }, { "epoch": 0.33999318767943165, "grad_norm": 1.7453582408679738, "learning_rate": 3.0368025598577008e-05, "loss": 0.2811, "step": 4367 }, { "epoch": 0.3400710427716413, "grad_norm": 1.8317446180109467, "learning_rate": 3.0363756595883523e-05, "loss": 0.2892, "step": 4368 }, { "epoch": 0.3401488978638509, "grad_norm": 1.6559202221406262, "learning_rate": 3.0359486947576718e-05, "loss": 0.26, "step": 4369 }, { "epoch": 0.34022675295606053, "grad_norm": 1.5263793947474218, "learning_rate": 3.0355216653922575e-05, "loss": 0.2463, "step": 4370 }, { "epoch": 0.3403046080482702, "grad_norm": 1.6279679130136826, "learning_rate": 3.0350945715187103e-05, "loss": 0.3141, "step": 4371 }, { "epoch": 0.34038246314047976, "grad_norm": 1.5854939792413878, "learning_rate": 3.0346674131636368e-05, "loss": 0.2489, "step": 4372 }, { "epoch": 0.3404603182326894, "grad_norm": 1.7236000358318744, "learning_rate": 3.0342401903536464e-05, "loss": 0.2814, "step": 4373 }, { "epoch": 0.34053817332489905, "grad_norm": 1.7265770748157596, "learning_rate": 3.0338129031153546e-05, "loss": 0.2823, "step": 4374 }, { "epoch": 0.34061602841710864, "grad_norm": 1.8290121385365363, "learning_rate": 3.0333855514753775e-05, "loss": 0.3026, "step": 4375 }, { "epoch": 0.3406938835093183, "grad_norm": 1.6326910917224724, "learning_rate": 3.032958135460338e-05, "loss": 0.275, "step": 4376 }, { "epoch": 0.34077173860152793, "grad_norm": 1.6752226922983966, "learning_rate": 3.032530655096862e-05, "loss": 0.277, "step": 4377 }, { "epoch": 0.3408495936937375, "grad_norm": 1.6036246162337626, "learning_rate": 3.032103110411579e-05, "loss": 0.281, "step": 4378 }, { "epoch": 0.34092744878594716, "grad_norm": 1.5244653188728374, "learning_rate": 3.0316755014311243e-05, "loss": 0.2455, "step": 4379 }, { "epoch": 0.3410053038781568, "grad_norm": 1.6799100953225183, "learning_rate": 3.031247828182135e-05, "loss": 0.273, "step": 4380 }, { "epoch": 0.3410831589703664, "grad_norm": 1.505885596424911, "learning_rate": 3.0308200906912534e-05, "loss": 0.2354, "step": 4381 }, { "epoch": 0.34116101406257604, "grad_norm": 1.4096371361706725, "learning_rate": 3.030392288985125e-05, "loss": 0.2187, "step": 4382 }, { "epoch": 0.34123886915478563, "grad_norm": 1.5029030729453141, "learning_rate": 3.0299644230904005e-05, "loss": 0.2725, "step": 4383 }, { "epoch": 0.3413167242469953, "grad_norm": 1.5834616844151708, "learning_rate": 3.029536493033734e-05, "loss": 0.2325, "step": 4384 }, { "epoch": 0.3413945793392049, "grad_norm": 1.6338936821294754, "learning_rate": 3.0291084988417832e-05, "loss": 0.2886, "step": 4385 }, { "epoch": 0.3414724344314145, "grad_norm": 1.560769647880621, "learning_rate": 3.0286804405412097e-05, "loss": 0.266, "step": 4386 }, { "epoch": 0.34155028952362415, "grad_norm": 1.7007379624451227, "learning_rate": 3.02825231815868e-05, "loss": 0.3138, "step": 4387 }, { "epoch": 0.3416281446158338, "grad_norm": 1.653837723413717, "learning_rate": 3.0278241317208644e-05, "loss": 0.3131, "step": 4388 }, { "epoch": 0.3417059997080434, "grad_norm": 1.5606396010738661, "learning_rate": 3.027395881254436e-05, "loss": 0.2926, "step": 4389 }, { "epoch": 0.34178385480025303, "grad_norm": 1.591346439813922, "learning_rate": 3.0269675667860743e-05, "loss": 0.2992, "step": 4390 }, { "epoch": 0.3418617098924627, "grad_norm": 1.4280799696934676, "learning_rate": 3.02653918834246e-05, "loss": 0.2369, "step": 4391 }, { "epoch": 0.34193956498467226, "grad_norm": 1.574830620804573, "learning_rate": 3.0261107459502787e-05, "loss": 0.2387, "step": 4392 }, { "epoch": 0.3420174200768819, "grad_norm": 1.6929573948330787, "learning_rate": 3.025682239636222e-05, "loss": 0.262, "step": 4393 }, { "epoch": 0.34209527516909155, "grad_norm": 1.640018101889868, "learning_rate": 3.0252536694269827e-05, "loss": 0.268, "step": 4394 }, { "epoch": 0.34217313026130114, "grad_norm": 1.5213302323394622, "learning_rate": 3.0248250353492588e-05, "loss": 0.2426, "step": 4395 }, { "epoch": 0.3422509853535108, "grad_norm": 1.7493411289064318, "learning_rate": 3.0243963374297523e-05, "loss": 0.2971, "step": 4396 }, { "epoch": 0.34232884044572043, "grad_norm": 1.58314705115181, "learning_rate": 3.023967575695169e-05, "loss": 0.2736, "step": 4397 }, { "epoch": 0.34240669553793, "grad_norm": 1.674901581970615, "learning_rate": 3.0235387501722192e-05, "loss": 0.2946, "step": 4398 }, { "epoch": 0.34248455063013966, "grad_norm": 1.5401095623078205, "learning_rate": 3.023109860887616e-05, "loss": 0.2461, "step": 4399 }, { "epoch": 0.34256240572234925, "grad_norm": 1.6151005972726895, "learning_rate": 3.0226809078680777e-05, "loss": 0.2938, "step": 4400 }, { "epoch": 0.34256240572234925, "eval_loss": 0.03407171368598938, "eval_runtime": 163.3885, "eval_samples_per_second": 17.627, "eval_steps_per_second": 0.63, "step": 4400 }, { "epoch": 0.3426402608145589, "grad_norm": 1.4316129210084607, "learning_rate": 3.022251891140325e-05, "loss": 0.2495, "step": 4401 }, { "epoch": 0.34271811590676854, "grad_norm": 1.6784723087311175, "learning_rate": 3.0218228107310853e-05, "loss": 0.3134, "step": 4402 }, { "epoch": 0.34279597099897813, "grad_norm": 1.538303362176691, "learning_rate": 3.021393666667088e-05, "loss": 0.2568, "step": 4403 }, { "epoch": 0.34287382609118777, "grad_norm": 1.5792729828109404, "learning_rate": 3.0209644589750654e-05, "loss": 0.254, "step": 4404 }, { "epoch": 0.3429516811833974, "grad_norm": 1.5610259781953868, "learning_rate": 3.0205351876817563e-05, "loss": 0.2375, "step": 4405 }, { "epoch": 0.343029536275607, "grad_norm": 1.5801756446139819, "learning_rate": 3.0201058528139015e-05, "loss": 0.2806, "step": 4406 }, { "epoch": 0.34310739136781665, "grad_norm": 1.5814746714917134, "learning_rate": 3.0196764543982472e-05, "loss": 0.2794, "step": 4407 }, { "epoch": 0.3431852464600263, "grad_norm": 1.6140357900634128, "learning_rate": 3.0192469924615434e-05, "loss": 0.2672, "step": 4408 }, { "epoch": 0.3432631015522359, "grad_norm": 1.587216552759335, "learning_rate": 3.018817467030542e-05, "loss": 0.272, "step": 4409 }, { "epoch": 0.3433409566444455, "grad_norm": 1.5806968862092552, "learning_rate": 3.0183878781320017e-05, "loss": 0.2665, "step": 4410 }, { "epoch": 0.34341881173665517, "grad_norm": 1.4106008671840573, "learning_rate": 3.017958225792683e-05, "loss": 0.2467, "step": 4411 }, { "epoch": 0.34349666682886476, "grad_norm": 1.5903320378734116, "learning_rate": 3.0175285100393517e-05, "loss": 0.2567, "step": 4412 }, { "epoch": 0.3435745219210744, "grad_norm": 1.5775103381815996, "learning_rate": 3.017098730898778e-05, "loss": 0.2725, "step": 4413 }, { "epoch": 0.34365237701328405, "grad_norm": 1.4783030875099485, "learning_rate": 3.0166688883977327e-05, "loss": 0.2362, "step": 4414 }, { "epoch": 0.34373023210549364, "grad_norm": 1.5430864211456856, "learning_rate": 3.016238982562995e-05, "loss": 0.2808, "step": 4415 }, { "epoch": 0.3438080871977033, "grad_norm": 1.5215687371180862, "learning_rate": 3.015809013421346e-05, "loss": 0.2459, "step": 4416 }, { "epoch": 0.3438859422899129, "grad_norm": 1.6536867471380907, "learning_rate": 3.01537898099957e-05, "loss": 0.2832, "step": 4417 }, { "epoch": 0.3439637973821225, "grad_norm": 1.6769109678819407, "learning_rate": 3.014948885324456e-05, "loss": 0.2542, "step": 4418 }, { "epoch": 0.34404165247433216, "grad_norm": 1.669745375297674, "learning_rate": 3.0145187264227966e-05, "loss": 0.2767, "step": 4419 }, { "epoch": 0.34411950756654175, "grad_norm": 1.6190196503942091, "learning_rate": 3.0140885043213893e-05, "loss": 0.2728, "step": 4420 }, { "epoch": 0.3441973626587514, "grad_norm": 1.6233591924931572, "learning_rate": 3.0136582190470354e-05, "loss": 0.2599, "step": 4421 }, { "epoch": 0.34427521775096104, "grad_norm": 1.484276696454071, "learning_rate": 3.0132278706265388e-05, "loss": 0.2563, "step": 4422 }, { "epoch": 0.3443530728431706, "grad_norm": 1.5592646509486523, "learning_rate": 3.012797459086709e-05, "loss": 0.2789, "step": 4423 }, { "epoch": 0.34443092793538027, "grad_norm": 1.487896289169735, "learning_rate": 3.0123669844543574e-05, "loss": 0.2522, "step": 4424 }, { "epoch": 0.3445087830275899, "grad_norm": 1.647530922242364, "learning_rate": 3.011936446756301e-05, "loss": 0.281, "step": 4425 }, { "epoch": 0.3445866381197995, "grad_norm": 1.5594519485751692, "learning_rate": 3.011505846019361e-05, "loss": 0.2578, "step": 4426 }, { "epoch": 0.34466449321200915, "grad_norm": 1.680427556479744, "learning_rate": 3.0110751822703612e-05, "loss": 0.298, "step": 4427 }, { "epoch": 0.3447423483042188, "grad_norm": 1.7031687379062546, "learning_rate": 3.01064445553613e-05, "loss": 0.2753, "step": 4428 }, { "epoch": 0.3448202033964284, "grad_norm": 1.6290169378452952, "learning_rate": 3.0102136658435e-05, "loss": 0.2481, "step": 4429 }, { "epoch": 0.344898058488638, "grad_norm": 1.5279986020429699, "learning_rate": 3.0097828132193063e-05, "loss": 0.2424, "step": 4430 }, { "epoch": 0.34497591358084767, "grad_norm": 1.5715441811631718, "learning_rate": 3.0093518976903903e-05, "loss": 0.2738, "step": 4431 }, { "epoch": 0.34505376867305726, "grad_norm": 1.4432824124120376, "learning_rate": 3.0089209192835952e-05, "loss": 0.2435, "step": 4432 }, { "epoch": 0.3451316237652669, "grad_norm": 1.593147967042801, "learning_rate": 3.0084898780257693e-05, "loss": 0.2691, "step": 4433 }, { "epoch": 0.34520947885747655, "grad_norm": 1.4345327069733171, "learning_rate": 3.0080587739437644e-05, "loss": 0.2549, "step": 4434 }, { "epoch": 0.34528733394968614, "grad_norm": 1.7656101883867998, "learning_rate": 3.0076276070644357e-05, "loss": 0.3082, "step": 4435 }, { "epoch": 0.3453651890418958, "grad_norm": 1.6559550140962307, "learning_rate": 3.0071963774146438e-05, "loss": 0.2966, "step": 4436 }, { "epoch": 0.34544304413410537, "grad_norm": 1.5472228996273845, "learning_rate": 3.0067650850212516e-05, "loss": 0.2523, "step": 4437 }, { "epoch": 0.345520899226315, "grad_norm": 1.4380672732745077, "learning_rate": 3.006333729911127e-05, "loss": 0.2325, "step": 4438 }, { "epoch": 0.34559875431852466, "grad_norm": 1.4662501901614537, "learning_rate": 3.0059023121111414e-05, "loss": 0.2585, "step": 4439 }, { "epoch": 0.34567660941073425, "grad_norm": 1.5651442140115996, "learning_rate": 3.005470831648169e-05, "loss": 0.2558, "step": 4440 }, { "epoch": 0.3457544645029439, "grad_norm": 1.555415591496988, "learning_rate": 3.0050392885490914e-05, "loss": 0.2634, "step": 4441 }, { "epoch": 0.34583231959515354, "grad_norm": 1.863113770434197, "learning_rate": 3.004607682840789e-05, "loss": 0.304, "step": 4442 }, { "epoch": 0.3459101746873631, "grad_norm": 1.610631749500613, "learning_rate": 3.0041760145501505e-05, "loss": 0.3276, "step": 4443 }, { "epoch": 0.34598802977957277, "grad_norm": 1.545244650707654, "learning_rate": 3.0037442837040665e-05, "loss": 0.2245, "step": 4444 }, { "epoch": 0.3460658848717824, "grad_norm": 1.5363622297244912, "learning_rate": 3.003312490329431e-05, "loss": 0.2461, "step": 4445 }, { "epoch": 0.346143739963992, "grad_norm": 1.7401766752862664, "learning_rate": 3.0028806344531443e-05, "loss": 0.2817, "step": 4446 }, { "epoch": 0.34622159505620165, "grad_norm": 1.7070045667390439, "learning_rate": 3.0024487161021073e-05, "loss": 0.2975, "step": 4447 }, { "epoch": 0.3462994501484113, "grad_norm": 1.5435085292288702, "learning_rate": 3.002016735303227e-05, "loss": 0.2704, "step": 4448 }, { "epoch": 0.3463773052406209, "grad_norm": 1.5391388728713384, "learning_rate": 3.0015846920834146e-05, "loss": 0.2369, "step": 4449 }, { "epoch": 0.3464551603328305, "grad_norm": 1.640102874839711, "learning_rate": 3.0011525864695838e-05, "loss": 0.2867, "step": 4450 }, { "epoch": 0.3464551603328305, "eval_loss": 0.03316964581608772, "eval_runtime": 161.8604, "eval_samples_per_second": 17.793, "eval_steps_per_second": 0.636, "step": 4450 }, { "epoch": 0.34653301542504017, "grad_norm": 1.6079501942190473, "learning_rate": 3.0007204184886524e-05, "loss": 0.255, "step": 4451 }, { "epoch": 0.34661087051724976, "grad_norm": 1.4287091133154661, "learning_rate": 3.0002881881675424e-05, "loss": 0.2505, "step": 4452 }, { "epoch": 0.3466887256094594, "grad_norm": 1.5650917297781652, "learning_rate": 2.9998558955331805e-05, "loss": 0.2879, "step": 4453 }, { "epoch": 0.346766580701669, "grad_norm": 1.687624389171991, "learning_rate": 2.999423540612496e-05, "loss": 0.29, "step": 4454 }, { "epoch": 0.34684443579387864, "grad_norm": 1.6106774319468937, "learning_rate": 2.9989911234324224e-05, "loss": 0.2936, "step": 4455 }, { "epoch": 0.3469222908860883, "grad_norm": 1.5208135423527593, "learning_rate": 2.9985586440198982e-05, "loss": 0.2706, "step": 4456 }, { "epoch": 0.34700014597829787, "grad_norm": 1.5602254292243922, "learning_rate": 2.9981261024018633e-05, "loss": 0.2258, "step": 4457 }, { "epoch": 0.3470780010705075, "grad_norm": 1.5920263147092077, "learning_rate": 2.997693498605264e-05, "loss": 0.2705, "step": 4458 }, { "epoch": 0.34715585616271716, "grad_norm": 1.548466712213371, "learning_rate": 2.997260832657049e-05, "loss": 0.2675, "step": 4459 }, { "epoch": 0.34723371125492675, "grad_norm": 1.6167228406935066, "learning_rate": 2.9968281045841725e-05, "loss": 0.256, "step": 4460 }, { "epoch": 0.3473115663471364, "grad_norm": 1.5477564561927557, "learning_rate": 2.99639531441359e-05, "loss": 0.278, "step": 4461 }, { "epoch": 0.34738942143934604, "grad_norm": 1.5114031063616922, "learning_rate": 2.9959624621722633e-05, "loss": 0.2614, "step": 4462 }, { "epoch": 0.3474672765315556, "grad_norm": 1.456114275625913, "learning_rate": 2.995529547887156e-05, "loss": 0.253, "step": 4463 }, { "epoch": 0.34754513162376527, "grad_norm": 1.5944919036223912, "learning_rate": 2.9950965715852378e-05, "loss": 0.2576, "step": 4464 }, { "epoch": 0.3476229867159749, "grad_norm": 1.6065291850417218, "learning_rate": 2.9946635332934802e-05, "loss": 0.306, "step": 4465 }, { "epoch": 0.3477008418081845, "grad_norm": 1.5886639992427312, "learning_rate": 2.9942304330388607e-05, "loss": 0.2615, "step": 4466 }, { "epoch": 0.34777869690039415, "grad_norm": 1.7065328466969742, "learning_rate": 2.9937972708483575e-05, "loss": 0.2646, "step": 4467 }, { "epoch": 0.3478565519926038, "grad_norm": 1.6220504458083185, "learning_rate": 2.993364046748956e-05, "loss": 0.2901, "step": 4468 }, { "epoch": 0.3479344070848134, "grad_norm": 1.5653847900585256, "learning_rate": 2.9929307607676433e-05, "loss": 0.2389, "step": 4469 }, { "epoch": 0.348012262177023, "grad_norm": 1.474485637360599, "learning_rate": 2.9924974129314122e-05, "loss": 0.2332, "step": 4470 }, { "epoch": 0.34809011726923267, "grad_norm": 1.6099801557099758, "learning_rate": 2.9920640032672567e-05, "loss": 0.2612, "step": 4471 }, { "epoch": 0.34816797236144226, "grad_norm": 1.5794845076340003, "learning_rate": 2.991630531802177e-05, "loss": 0.2545, "step": 4472 }, { "epoch": 0.3482458274536519, "grad_norm": 1.6196929942232816, "learning_rate": 2.991196998563176e-05, "loss": 0.2706, "step": 4473 }, { "epoch": 0.3483236825458615, "grad_norm": 1.5810390720686973, "learning_rate": 2.990763403577261e-05, "loss": 0.2694, "step": 4474 }, { "epoch": 0.34840153763807113, "grad_norm": 1.535900868389894, "learning_rate": 2.990329746871444e-05, "loss": 0.2088, "step": 4475 }, { "epoch": 0.3484793927302808, "grad_norm": 1.5785483276408336, "learning_rate": 2.9898960284727378e-05, "loss": 0.246, "step": 4476 }, { "epoch": 0.34855724782249037, "grad_norm": 1.557049360474312, "learning_rate": 2.9894622484081623e-05, "loss": 0.2662, "step": 4477 }, { "epoch": 0.3486351029147, "grad_norm": 1.673444036241734, "learning_rate": 2.9890284067047388e-05, "loss": 0.2604, "step": 4478 }, { "epoch": 0.34871295800690966, "grad_norm": 1.5429509546889815, "learning_rate": 2.9885945033894956e-05, "loss": 0.2274, "step": 4479 }, { "epoch": 0.34879081309911925, "grad_norm": 1.5298235241818765, "learning_rate": 2.9881605384894613e-05, "loss": 0.2819, "step": 4480 }, { "epoch": 0.3488686681913289, "grad_norm": 1.5147831163252972, "learning_rate": 2.9877265120316697e-05, "loss": 0.243, "step": 4481 }, { "epoch": 0.34894652328353853, "grad_norm": 1.6018913026936692, "learning_rate": 2.9872924240431595e-05, "loss": 0.2557, "step": 4482 }, { "epoch": 0.3490243783757481, "grad_norm": 1.4762213375805653, "learning_rate": 2.986858274550972e-05, "loss": 0.2573, "step": 4483 }, { "epoch": 0.34910223346795777, "grad_norm": 1.5411649380715822, "learning_rate": 2.9864240635821525e-05, "loss": 0.2524, "step": 4484 }, { "epoch": 0.3491800885601674, "grad_norm": 1.4693794609996682, "learning_rate": 2.985989791163751e-05, "loss": 0.2233, "step": 4485 }, { "epoch": 0.349257943652377, "grad_norm": 1.5211862987762403, "learning_rate": 2.9855554573228192e-05, "loss": 0.2942, "step": 4486 }, { "epoch": 0.34933579874458665, "grad_norm": 1.52207158319431, "learning_rate": 2.9851210620864153e-05, "loss": 0.2474, "step": 4487 }, { "epoch": 0.3494136538367963, "grad_norm": 1.4747199855083826, "learning_rate": 2.9846866054816e-05, "loss": 0.2261, "step": 4488 }, { "epoch": 0.3494915089290059, "grad_norm": 1.7386472099914831, "learning_rate": 2.9842520875354374e-05, "loss": 0.3056, "step": 4489 }, { "epoch": 0.3495693640212155, "grad_norm": 1.5641312566045609, "learning_rate": 2.983817508274996e-05, "loss": 0.2721, "step": 4490 }, { "epoch": 0.3496472191134251, "grad_norm": 1.5253569732061327, "learning_rate": 2.9833828677273476e-05, "loss": 0.3135, "step": 4491 }, { "epoch": 0.34972507420563476, "grad_norm": 1.5092980728940524, "learning_rate": 2.9829481659195698e-05, "loss": 0.2443, "step": 4492 }, { "epoch": 0.3498029292978444, "grad_norm": 1.6237966891725575, "learning_rate": 2.9825134028787414e-05, "loss": 0.2675, "step": 4493 }, { "epoch": 0.349880784390054, "grad_norm": 1.4984415266790798, "learning_rate": 2.9820785786319462e-05, "loss": 0.2152, "step": 4494 }, { "epoch": 0.34995863948226363, "grad_norm": 1.5968314409349798, "learning_rate": 2.981643693206272e-05, "loss": 0.2545, "step": 4495 }, { "epoch": 0.3500364945744733, "grad_norm": 1.5371576125933148, "learning_rate": 2.981208746628809e-05, "loss": 0.2782, "step": 4496 }, { "epoch": 0.35011434966668287, "grad_norm": 1.6244798983815345, "learning_rate": 2.9807737389266534e-05, "loss": 0.289, "step": 4497 }, { "epoch": 0.3501922047588925, "grad_norm": 1.6672632398500142, "learning_rate": 2.980338670126904e-05, "loss": 0.2635, "step": 4498 }, { "epoch": 0.35027005985110216, "grad_norm": 1.5843432447432289, "learning_rate": 2.9799035402566638e-05, "loss": 0.2461, "step": 4499 }, { "epoch": 0.35034791494331174, "grad_norm": 1.5217146265915358, "learning_rate": 2.9794683493430388e-05, "loss": 0.2377, "step": 4500 }, { "epoch": 0.35034791494331174, "eval_loss": 0.032623715698719025, "eval_runtime": 162.0125, "eval_samples_per_second": 17.776, "eval_steps_per_second": 0.636, "step": 4500 }, { "epoch": 0.3504257700355214, "grad_norm": 1.5317280160858748, "learning_rate": 2.9790330974131393e-05, "loss": 0.248, "step": 4501 }, { "epoch": 0.35050362512773103, "grad_norm": 1.6347901358854644, "learning_rate": 2.97859778449408e-05, "loss": 0.2521, "step": 4502 }, { "epoch": 0.3505814802199406, "grad_norm": 1.6152353367218448, "learning_rate": 2.9781624106129788e-05, "loss": 0.2587, "step": 4503 }, { "epoch": 0.35065933531215027, "grad_norm": 1.5226917836974077, "learning_rate": 2.9777269757969566e-05, "loss": 0.2514, "step": 4504 }, { "epoch": 0.3507371904043599, "grad_norm": 1.551706702998522, "learning_rate": 2.9772914800731398e-05, "loss": 0.2791, "step": 4505 }, { "epoch": 0.3508150454965695, "grad_norm": 1.4442701266206108, "learning_rate": 2.9768559234686574e-05, "loss": 0.2233, "step": 4506 }, { "epoch": 0.35089290058877914, "grad_norm": 1.6493504800615777, "learning_rate": 2.9764203060106424e-05, "loss": 0.2468, "step": 4507 }, { "epoch": 0.35097075568098873, "grad_norm": 1.4304319707160795, "learning_rate": 2.9759846277262325e-05, "loss": 0.2606, "step": 4508 }, { "epoch": 0.3510486107731984, "grad_norm": 1.6354842076215146, "learning_rate": 2.9755488886425672e-05, "loss": 0.261, "step": 4509 }, { "epoch": 0.351126465865408, "grad_norm": 1.5204841920222127, "learning_rate": 2.9751130887867917e-05, "loss": 0.237, "step": 4510 }, { "epoch": 0.3512043209576176, "grad_norm": 1.639443301180757, "learning_rate": 2.9746772281860548e-05, "loss": 0.2521, "step": 4511 }, { "epoch": 0.35128217604982725, "grad_norm": 1.373763193582879, "learning_rate": 2.9742413068675074e-05, "loss": 0.2562, "step": 4512 }, { "epoch": 0.3513600311420369, "grad_norm": 1.4730835142408185, "learning_rate": 2.9738053248583062e-05, "loss": 0.2452, "step": 4513 }, { "epoch": 0.3514378862342465, "grad_norm": 1.5813820129352565, "learning_rate": 2.97336928218561e-05, "loss": 0.2637, "step": 4514 }, { "epoch": 0.35151574132645613, "grad_norm": 1.4081017549661983, "learning_rate": 2.9729331788765834e-05, "loss": 0.23, "step": 4515 }, { "epoch": 0.3515935964186658, "grad_norm": 1.5690954361601432, "learning_rate": 2.972497014958393e-05, "loss": 0.2649, "step": 4516 }, { "epoch": 0.35167145151087537, "grad_norm": 1.4473430452336968, "learning_rate": 2.9720607904582094e-05, "loss": 0.2521, "step": 4517 }, { "epoch": 0.351749306603085, "grad_norm": 1.604150122369739, "learning_rate": 2.9716245054032077e-05, "loss": 0.2377, "step": 4518 }, { "epoch": 0.35182716169529465, "grad_norm": 1.5051590019357273, "learning_rate": 2.971188159820566e-05, "loss": 0.245, "step": 4519 }, { "epoch": 0.35190501678750424, "grad_norm": 1.591568229087452, "learning_rate": 2.9707517537374676e-05, "loss": 0.2616, "step": 4520 }, { "epoch": 0.3519828718797139, "grad_norm": 1.63478717786098, "learning_rate": 2.9703152871810978e-05, "loss": 0.2924, "step": 4521 }, { "epoch": 0.35206072697192353, "grad_norm": 1.4899086764707874, "learning_rate": 2.9698787601786462e-05, "loss": 0.245, "step": 4522 }, { "epoch": 0.3521385820641331, "grad_norm": 1.5921798782047725, "learning_rate": 2.9694421727573067e-05, "loss": 0.2487, "step": 4523 }, { "epoch": 0.35221643715634277, "grad_norm": 1.6572861646278918, "learning_rate": 2.969005524944277e-05, "loss": 0.264, "step": 4524 }, { "epoch": 0.35229429224855235, "grad_norm": 1.6026107398143739, "learning_rate": 2.968568816766757e-05, "loss": 0.2541, "step": 4525 }, { "epoch": 0.352372147340762, "grad_norm": 1.5993815506820541, "learning_rate": 2.9681320482519526e-05, "loss": 0.2726, "step": 4526 }, { "epoch": 0.35245000243297164, "grad_norm": 1.4940062155865805, "learning_rate": 2.9676952194270726e-05, "loss": 0.2299, "step": 4527 }, { "epoch": 0.35252785752518123, "grad_norm": 1.6171680436209546, "learning_rate": 2.9672583303193293e-05, "loss": 0.3123, "step": 4528 }, { "epoch": 0.3526057126173909, "grad_norm": 1.6595878776486166, "learning_rate": 2.966821380955938e-05, "loss": 0.2527, "step": 4529 }, { "epoch": 0.3526835677096005, "grad_norm": 1.7834810680708075, "learning_rate": 2.966384371364119e-05, "loss": 0.2562, "step": 4530 }, { "epoch": 0.3527614228018101, "grad_norm": 1.3766276243827147, "learning_rate": 2.9659473015710963e-05, "loss": 0.2561, "step": 4531 }, { "epoch": 0.35283927789401975, "grad_norm": 1.579826920007064, "learning_rate": 2.9655101716040972e-05, "loss": 0.2567, "step": 4532 }, { "epoch": 0.3529171329862294, "grad_norm": 1.6146363809830966, "learning_rate": 2.9650729814903524e-05, "loss": 0.265, "step": 4533 }, { "epoch": 0.352994988078439, "grad_norm": 1.5023367063907085, "learning_rate": 2.9646357312570974e-05, "loss": 0.2295, "step": 4534 }, { "epoch": 0.35307284317064863, "grad_norm": 1.4324824726002188, "learning_rate": 2.9641984209315703e-05, "loss": 0.2148, "step": 4535 }, { "epoch": 0.3531506982628583, "grad_norm": 1.604584257180358, "learning_rate": 2.9637610505410136e-05, "loss": 0.2494, "step": 4536 }, { "epoch": 0.35322855335506786, "grad_norm": 1.5836681076730126, "learning_rate": 2.9633236201126738e-05, "loss": 0.2246, "step": 4537 }, { "epoch": 0.3533064084472775, "grad_norm": 1.5097836306502397, "learning_rate": 2.962886129673801e-05, "loss": 0.2384, "step": 4538 }, { "epoch": 0.35338426353948715, "grad_norm": 1.5289500304674863, "learning_rate": 2.9624485792516477e-05, "loss": 0.2221, "step": 4539 }, { "epoch": 0.35346211863169674, "grad_norm": 1.3935337468126683, "learning_rate": 2.9620109688734713e-05, "loss": 0.2245, "step": 4540 }, { "epoch": 0.3535399737239064, "grad_norm": 1.5692889636126681, "learning_rate": 2.9615732985665345e-05, "loss": 0.2807, "step": 4541 }, { "epoch": 0.35361782881611603, "grad_norm": 1.653904491039991, "learning_rate": 2.9611355683581006e-05, "loss": 0.2735, "step": 4542 }, { "epoch": 0.3536956839083256, "grad_norm": 1.4245393469800347, "learning_rate": 2.9606977782754378e-05, "loss": 0.2425, "step": 4543 }, { "epoch": 0.35377353900053526, "grad_norm": 1.4432832784345397, "learning_rate": 2.9602599283458198e-05, "loss": 0.2356, "step": 4544 }, { "epoch": 0.35385139409274485, "grad_norm": 1.5154778714497685, "learning_rate": 2.9598220185965215e-05, "loss": 0.2616, "step": 4545 }, { "epoch": 0.3539292491849545, "grad_norm": 1.5330331038231135, "learning_rate": 2.9593840490548236e-05, "loss": 0.2535, "step": 4546 }, { "epoch": 0.35400710427716414, "grad_norm": 1.6248263641000602, "learning_rate": 2.9589460197480083e-05, "loss": 0.266, "step": 4547 }, { "epoch": 0.35408495936937373, "grad_norm": 1.4591662299203294, "learning_rate": 2.9585079307033636e-05, "loss": 0.2515, "step": 4548 }, { "epoch": 0.3541628144615834, "grad_norm": 1.5401638191979268, "learning_rate": 2.9580697819481803e-05, "loss": 0.2674, "step": 4549 }, { "epoch": 0.354240669553793, "grad_norm": 1.5865513935980262, "learning_rate": 2.957631573509753e-05, "loss": 0.2822, "step": 4550 }, { "epoch": 0.354240669553793, "eval_loss": 0.03186272084712982, "eval_runtime": 162.0666, "eval_samples_per_second": 17.77, "eval_steps_per_second": 0.636, "step": 4550 }, { "epoch": 0.3543185246460026, "grad_norm": 1.5365542360485847, "learning_rate": 2.9571933054153802e-05, "loss": 0.2776, "step": 4551 }, { "epoch": 0.35439637973821225, "grad_norm": 1.4442944956019401, "learning_rate": 2.956754977692363e-05, "loss": 0.2234, "step": 4552 }, { "epoch": 0.3544742348304219, "grad_norm": 1.579370665706068, "learning_rate": 2.956316590368008e-05, "loss": 0.2753, "step": 4553 }, { "epoch": 0.3545520899226315, "grad_norm": 1.7988896095803915, "learning_rate": 2.9558781434696245e-05, "loss": 0.2949, "step": 4554 }, { "epoch": 0.35462994501484113, "grad_norm": 1.465519745484643, "learning_rate": 2.955439637024526e-05, "loss": 0.2744, "step": 4555 }, { "epoch": 0.3547078001070508, "grad_norm": 1.5911615508387769, "learning_rate": 2.9550010710600294e-05, "loss": 0.275, "step": 4556 }, { "epoch": 0.35478565519926036, "grad_norm": 1.5163254695552806, "learning_rate": 2.9545624456034546e-05, "loss": 0.2566, "step": 4557 }, { "epoch": 0.35486351029147, "grad_norm": 1.3789255379108107, "learning_rate": 2.9541237606821256e-05, "loss": 0.2286, "step": 4558 }, { "epoch": 0.35494136538367965, "grad_norm": 1.4971851736334592, "learning_rate": 2.953685016323372e-05, "loss": 0.2325, "step": 4559 }, { "epoch": 0.35501922047588924, "grad_norm": 1.47603913531154, "learning_rate": 2.953246212554524e-05, "loss": 0.2276, "step": 4560 }, { "epoch": 0.3550970755680989, "grad_norm": 1.6187409488024125, "learning_rate": 2.952807349402918e-05, "loss": 0.2618, "step": 4561 }, { "epoch": 0.3551749306603085, "grad_norm": 1.6543819109786075, "learning_rate": 2.9523684268958927e-05, "loss": 0.2978, "step": 4562 }, { "epoch": 0.3552527857525181, "grad_norm": 1.6619918636862536, "learning_rate": 2.9519294450607905e-05, "loss": 0.2574, "step": 4563 }, { "epoch": 0.35533064084472776, "grad_norm": 1.6077564417218688, "learning_rate": 2.9514904039249588e-05, "loss": 0.2422, "step": 4564 }, { "epoch": 0.35540849593693735, "grad_norm": 1.5837324443221823, "learning_rate": 2.951051303515747e-05, "loss": 0.2493, "step": 4565 }, { "epoch": 0.355486351029147, "grad_norm": 1.5435279997339295, "learning_rate": 2.95061214386051e-05, "loss": 0.2554, "step": 4566 }, { "epoch": 0.35556420612135664, "grad_norm": 1.5906254046882944, "learning_rate": 2.950172924986604e-05, "loss": 0.2522, "step": 4567 }, { "epoch": 0.35564206121356623, "grad_norm": 1.5512372830268757, "learning_rate": 2.949733646921391e-05, "loss": 0.2528, "step": 4568 }, { "epoch": 0.3557199163057759, "grad_norm": 1.5670404569681908, "learning_rate": 2.9492943096922363e-05, "loss": 0.2713, "step": 4569 }, { "epoch": 0.3557977713979855, "grad_norm": 1.5428706289484913, "learning_rate": 2.9488549133265084e-05, "loss": 0.2506, "step": 4570 }, { "epoch": 0.3558756264901951, "grad_norm": 1.4746674416533248, "learning_rate": 2.948415457851579e-05, "loss": 0.2596, "step": 4571 }, { "epoch": 0.35595348158240475, "grad_norm": 1.5696149974046003, "learning_rate": 2.9479759432948243e-05, "loss": 0.2551, "step": 4572 }, { "epoch": 0.3560313366746144, "grad_norm": 1.631042041967869, "learning_rate": 2.9475363696836243e-05, "loss": 0.2832, "step": 4573 }, { "epoch": 0.356109191766824, "grad_norm": 1.5008642754115236, "learning_rate": 2.9470967370453626e-05, "loss": 0.2355, "step": 4574 }, { "epoch": 0.35618704685903363, "grad_norm": 1.6702950361824236, "learning_rate": 2.9466570454074263e-05, "loss": 0.2507, "step": 4575 }, { "epoch": 0.3562649019512433, "grad_norm": 1.5075570996193335, "learning_rate": 2.9462172947972045e-05, "loss": 0.2258, "step": 4576 }, { "epoch": 0.35634275704345286, "grad_norm": 1.5871022967846709, "learning_rate": 2.945777485242094e-05, "loss": 0.2543, "step": 4577 }, { "epoch": 0.3564206121356625, "grad_norm": 1.4397928254368595, "learning_rate": 2.945337616769491e-05, "loss": 0.243, "step": 4578 }, { "epoch": 0.3564984672278721, "grad_norm": 1.5396386682698662, "learning_rate": 2.9448976894067985e-05, "loss": 0.2614, "step": 4579 }, { "epoch": 0.35657632232008174, "grad_norm": 1.667135297938401, "learning_rate": 2.944457703181421e-05, "loss": 0.302, "step": 4580 }, { "epoch": 0.3566541774122914, "grad_norm": 1.4928306619928589, "learning_rate": 2.944017658120768e-05, "loss": 0.2315, "step": 4581 }, { "epoch": 0.356732032504501, "grad_norm": 1.5849953208818774, "learning_rate": 2.9435775542522528e-05, "loss": 0.2561, "step": 4582 }, { "epoch": 0.3568098875967106, "grad_norm": 1.5232225340963943, "learning_rate": 2.9431373916032902e-05, "loss": 0.2365, "step": 4583 }, { "epoch": 0.35688774268892026, "grad_norm": 1.4822273087232645, "learning_rate": 2.942697170201302e-05, "loss": 0.2201, "step": 4584 }, { "epoch": 0.35696559778112985, "grad_norm": 1.4837681320881426, "learning_rate": 2.9422568900737104e-05, "loss": 0.2327, "step": 4585 }, { "epoch": 0.3570434528733395, "grad_norm": 1.606438223701083, "learning_rate": 2.9418165512479436e-05, "loss": 0.2336, "step": 4586 }, { "epoch": 0.35712130796554914, "grad_norm": 1.7522794294955033, "learning_rate": 2.941376153751433e-05, "loss": 0.2709, "step": 4587 }, { "epoch": 0.35719916305775873, "grad_norm": 1.452231088923393, "learning_rate": 2.9409356976116133e-05, "loss": 0.2339, "step": 4588 }, { "epoch": 0.3572770181499684, "grad_norm": 1.5228031213980888, "learning_rate": 2.940495182855922e-05, "loss": 0.2473, "step": 4589 }, { "epoch": 0.357354873242178, "grad_norm": 1.5091536227927451, "learning_rate": 2.9400546095118015e-05, "loss": 0.2192, "step": 4590 }, { "epoch": 0.3574327283343876, "grad_norm": 1.4679655505145452, "learning_rate": 2.9396139776066973e-05, "loss": 0.2357, "step": 4591 }, { "epoch": 0.35751058342659725, "grad_norm": 1.5414034390848843, "learning_rate": 2.9391732871680592e-05, "loss": 0.2594, "step": 4592 }, { "epoch": 0.3575884385188069, "grad_norm": 1.582336387111012, "learning_rate": 2.93873253822334e-05, "loss": 0.2306, "step": 4593 }, { "epoch": 0.3576662936110165, "grad_norm": 1.564852438836569, "learning_rate": 2.9382917307999953e-05, "loss": 0.2387, "step": 4594 }, { "epoch": 0.35774414870322613, "grad_norm": 1.5520370662769636, "learning_rate": 2.9378508649254866e-05, "loss": 0.2469, "step": 4595 }, { "epoch": 0.3578220037954358, "grad_norm": 1.463151176091164, "learning_rate": 2.9374099406272768e-05, "loss": 0.2497, "step": 4596 }, { "epoch": 0.35789985888764536, "grad_norm": 1.5336852708799469, "learning_rate": 2.9369689579328348e-05, "loss": 0.2606, "step": 4597 }, { "epoch": 0.357977713979855, "grad_norm": 1.5735927634763027, "learning_rate": 2.9365279168696306e-05, "loss": 0.2399, "step": 4598 }, { "epoch": 0.3580555690720646, "grad_norm": 1.5053888236200421, "learning_rate": 2.9360868174651388e-05, "loss": 0.2436, "step": 4599 }, { "epoch": 0.35813342416427424, "grad_norm": 1.4388250279653616, "learning_rate": 2.9356456597468386e-05, "loss": 0.2379, "step": 4600 }, { "epoch": 0.35813342416427424, "eval_loss": 0.031361065804958344, "eval_runtime": 162.208, "eval_samples_per_second": 17.755, "eval_steps_per_second": 0.635, "step": 4600 }, { "epoch": 0.3582112792564839, "grad_norm": 1.5538677964640157, "learning_rate": 2.935204443742212e-05, "loss": 0.2373, "step": 4601 }, { "epoch": 0.35828913434869347, "grad_norm": 1.5047067953962983, "learning_rate": 2.934763169478744e-05, "loss": 0.2493, "step": 4602 }, { "epoch": 0.3583669894409031, "grad_norm": 1.6548554139964053, "learning_rate": 2.9343218369839243e-05, "loss": 0.2821, "step": 4603 }, { "epoch": 0.35844484453311276, "grad_norm": 1.5275826981116265, "learning_rate": 2.9338804462852462e-05, "loss": 0.2805, "step": 4604 }, { "epoch": 0.35852269962532235, "grad_norm": 1.6394586493787406, "learning_rate": 2.9334389974102057e-05, "loss": 0.258, "step": 4605 }, { "epoch": 0.358600554717532, "grad_norm": 1.3614807388437478, "learning_rate": 2.932997490386303e-05, "loss": 0.2251, "step": 4606 }, { "epoch": 0.35867840980974164, "grad_norm": 1.613034626569106, "learning_rate": 2.932555925241042e-05, "loss": 0.2359, "step": 4607 }, { "epoch": 0.3587562649019512, "grad_norm": 1.4894725481123654, "learning_rate": 2.9321143020019312e-05, "loss": 0.2475, "step": 4608 }, { "epoch": 0.35883411999416087, "grad_norm": 1.6586400570459305, "learning_rate": 2.93167262069648e-05, "loss": 0.2523, "step": 4609 }, { "epoch": 0.3589119750863705, "grad_norm": 1.4196127013179451, "learning_rate": 2.9312308813522036e-05, "loss": 0.2512, "step": 4610 }, { "epoch": 0.3589898301785801, "grad_norm": 1.4799819572530748, "learning_rate": 2.9307890839966213e-05, "loss": 0.2334, "step": 4611 }, { "epoch": 0.35906768527078975, "grad_norm": 1.666067514634915, "learning_rate": 2.9303472286572532e-05, "loss": 0.2608, "step": 4612 }, { "epoch": 0.3591455403629994, "grad_norm": 1.5105753705249783, "learning_rate": 2.9299053153616266e-05, "loss": 0.2323, "step": 4613 }, { "epoch": 0.359223395455209, "grad_norm": 1.4931586750252535, "learning_rate": 2.929463344137269e-05, "loss": 0.2359, "step": 4614 }, { "epoch": 0.3593012505474186, "grad_norm": 1.3849659097198777, "learning_rate": 2.9290213150117144e-05, "loss": 0.2105, "step": 4615 }, { "epoch": 0.3593791056396282, "grad_norm": 1.6207651157796004, "learning_rate": 2.9285792280124985e-05, "loss": 0.2836, "step": 4616 }, { "epoch": 0.35945696073183786, "grad_norm": 1.5630599526798756, "learning_rate": 2.9281370831671617e-05, "loss": 0.2868, "step": 4617 }, { "epoch": 0.3595348158240475, "grad_norm": 1.340915182127943, "learning_rate": 2.927694880503247e-05, "loss": 0.2352, "step": 4618 }, { "epoch": 0.3596126709162571, "grad_norm": 1.5226925254690968, "learning_rate": 2.927252620048302e-05, "loss": 0.2425, "step": 4619 }, { "epoch": 0.35969052600846674, "grad_norm": 1.5871144431960953, "learning_rate": 2.9268103018298765e-05, "loss": 0.2632, "step": 4620 }, { "epoch": 0.3597683811006764, "grad_norm": 1.4314571673560132, "learning_rate": 2.9263679258755262e-05, "loss": 0.2191, "step": 4621 }, { "epoch": 0.35984623619288597, "grad_norm": 1.5317694506664161, "learning_rate": 2.9259254922128083e-05, "loss": 0.2161, "step": 4622 }, { "epoch": 0.3599240912850956, "grad_norm": 1.460288938823212, "learning_rate": 2.9254830008692847e-05, "loss": 0.2231, "step": 4623 }, { "epoch": 0.36000194637730526, "grad_norm": 1.5177349912305986, "learning_rate": 2.92504045187252e-05, "loss": 0.2238, "step": 4624 }, { "epoch": 0.36007980146951485, "grad_norm": 1.4159462960003357, "learning_rate": 2.9245978452500834e-05, "loss": 0.2372, "step": 4625 }, { "epoch": 0.3601576565617245, "grad_norm": 1.483672047776705, "learning_rate": 2.924155181029547e-05, "loss": 0.2821, "step": 4626 }, { "epoch": 0.36023551165393414, "grad_norm": 1.4946244319736464, "learning_rate": 2.9237124592384866e-05, "loss": 0.2464, "step": 4627 }, { "epoch": 0.3603133667461437, "grad_norm": 1.4759720160732874, "learning_rate": 2.9232696799044822e-05, "loss": 0.2385, "step": 4628 }, { "epoch": 0.36039122183835337, "grad_norm": 1.5517710131584803, "learning_rate": 2.9228268430551162e-05, "loss": 0.2472, "step": 4629 }, { "epoch": 0.360469076930563, "grad_norm": 1.545617559547491, "learning_rate": 2.9223839487179754e-05, "loss": 0.2527, "step": 4630 }, { "epoch": 0.3605469320227726, "grad_norm": 1.5505074242542187, "learning_rate": 2.9219409969206504e-05, "loss": 0.2939, "step": 4631 }, { "epoch": 0.36062478711498225, "grad_norm": 1.6029059612604832, "learning_rate": 2.921497987690735e-05, "loss": 0.2711, "step": 4632 }, { "epoch": 0.36070264220719184, "grad_norm": 1.5923792922446027, "learning_rate": 2.9210549210558264e-05, "loss": 0.2806, "step": 4633 }, { "epoch": 0.3607804972994015, "grad_norm": 1.6625456997641361, "learning_rate": 2.9206117970435254e-05, "loss": 0.2923, "step": 4634 }, { "epoch": 0.3608583523916111, "grad_norm": 1.6117586767456187, "learning_rate": 2.920168615681436e-05, "loss": 0.2261, "step": 4635 }, { "epoch": 0.3609362074838207, "grad_norm": 1.4261707751782242, "learning_rate": 2.9197253769971676e-05, "loss": 0.2455, "step": 4636 }, { "epoch": 0.36101406257603036, "grad_norm": 1.5334528551786872, "learning_rate": 2.919282081018332e-05, "loss": 0.2292, "step": 4637 }, { "epoch": 0.36109191766824, "grad_norm": 1.568087236570913, "learning_rate": 2.9188387277725428e-05, "loss": 0.2559, "step": 4638 }, { "epoch": 0.3611697727604496, "grad_norm": 1.5047835722446739, "learning_rate": 2.91839531728742e-05, "loss": 0.2273, "step": 4639 }, { "epoch": 0.36124762785265924, "grad_norm": 1.6133429894565678, "learning_rate": 2.9179518495905857e-05, "loss": 0.2415, "step": 4640 }, { "epoch": 0.3613254829448689, "grad_norm": 1.6058907573938233, "learning_rate": 2.9175083247096663e-05, "loss": 0.2515, "step": 4641 }, { "epoch": 0.36140333803707847, "grad_norm": 1.4882593133070827, "learning_rate": 2.917064742672291e-05, "loss": 0.2397, "step": 4642 }, { "epoch": 0.3614811931292881, "grad_norm": 1.5838604915282692, "learning_rate": 2.9166211035060922e-05, "loss": 0.2371, "step": 4643 }, { "epoch": 0.36155904822149776, "grad_norm": 1.5584952275475608, "learning_rate": 2.9161774072387078e-05, "loss": 0.2504, "step": 4644 }, { "epoch": 0.36163690331370735, "grad_norm": 1.5649863160281339, "learning_rate": 2.9157336538977765e-05, "loss": 0.2581, "step": 4645 }, { "epoch": 0.361714758405917, "grad_norm": 1.426273963607534, "learning_rate": 2.9152898435109435e-05, "loss": 0.2451, "step": 4646 }, { "epoch": 0.36179261349812664, "grad_norm": 1.4693532738326103, "learning_rate": 2.9148459761058556e-05, "loss": 0.2295, "step": 4647 }, { "epoch": 0.3618704685903362, "grad_norm": 1.7145644534027003, "learning_rate": 2.9144020517101633e-05, "loss": 0.2774, "step": 4648 }, { "epoch": 0.36194832368254587, "grad_norm": 1.6084274705348542, "learning_rate": 2.9139580703515215e-05, "loss": 0.2693, "step": 4649 }, { "epoch": 0.3620261787747555, "grad_norm": 1.5557896178477157, "learning_rate": 2.9135140320575873e-05, "loss": 0.2498, "step": 4650 }, { "epoch": 0.3620261787747555, "eval_loss": 0.030784448608756065, "eval_runtime": 162.5113, "eval_samples_per_second": 17.722, "eval_steps_per_second": 0.634, "step": 4650 }, { "epoch": 0.3621040338669651, "grad_norm": 1.6010540785840162, "learning_rate": 2.9130699368560233e-05, "loss": 0.2617, "step": 4651 }, { "epoch": 0.36218188895917475, "grad_norm": 1.5167072407875157, "learning_rate": 2.9126257847744945e-05, "loss": 0.2371, "step": 4652 }, { "epoch": 0.36225974405138434, "grad_norm": 1.5788147340156031, "learning_rate": 2.912181575840668e-05, "loss": 0.2603, "step": 4653 }, { "epoch": 0.362337599143594, "grad_norm": 1.3951562687301844, "learning_rate": 2.9117373100822175e-05, "loss": 0.2337, "step": 4654 }, { "epoch": 0.3624154542358036, "grad_norm": 1.4436501049066208, "learning_rate": 2.9112929875268184e-05, "loss": 0.2395, "step": 4655 }, { "epoch": 0.3624933093280132, "grad_norm": 1.5763233300075739, "learning_rate": 2.910848608202149e-05, "loss": 0.2466, "step": 4656 }, { "epoch": 0.36257116442022286, "grad_norm": 1.5290897023951335, "learning_rate": 2.910404172135893e-05, "loss": 0.2588, "step": 4657 }, { "epoch": 0.3626490195124325, "grad_norm": 1.5532714890525916, "learning_rate": 2.909959679355736e-05, "loss": 0.2361, "step": 4658 }, { "epoch": 0.3627268746046421, "grad_norm": 1.5181681215242877, "learning_rate": 2.9095151298893687e-05, "loss": 0.218, "step": 4659 }, { "epoch": 0.36280472969685174, "grad_norm": 1.5471053534966834, "learning_rate": 2.9090705237644837e-05, "loss": 0.225, "step": 4660 }, { "epoch": 0.3628825847890614, "grad_norm": 1.3879862707668487, "learning_rate": 2.908625861008778e-05, "loss": 0.2031, "step": 4661 }, { "epoch": 0.36296043988127097, "grad_norm": 1.3606410763110686, "learning_rate": 2.9081811416499524e-05, "loss": 0.2158, "step": 4662 }, { "epoch": 0.3630382949734806, "grad_norm": 1.4974749409331742, "learning_rate": 2.9077363657157098e-05, "loss": 0.2754, "step": 4663 }, { "epoch": 0.36311615006569026, "grad_norm": 1.552777531924004, "learning_rate": 2.907291533233759e-05, "loss": 0.2492, "step": 4664 }, { "epoch": 0.36319400515789985, "grad_norm": 1.419694385517668, "learning_rate": 2.9068466442318103e-05, "loss": 0.2158, "step": 4665 }, { "epoch": 0.3632718602501095, "grad_norm": 1.4915148955652775, "learning_rate": 2.9064016987375782e-05, "loss": 0.2235, "step": 4666 }, { "epoch": 0.36334971534231914, "grad_norm": 1.5254126521593705, "learning_rate": 2.9059566967787803e-05, "loss": 0.2762, "step": 4667 }, { "epoch": 0.3634275704345287, "grad_norm": 1.536639803520933, "learning_rate": 2.9055116383831387e-05, "loss": 0.2397, "step": 4668 }, { "epoch": 0.36350542552673837, "grad_norm": 1.5521621853538368, "learning_rate": 2.9050665235783785e-05, "loss": 0.2464, "step": 4669 }, { "epoch": 0.36358328061894796, "grad_norm": 1.4775736622915112, "learning_rate": 2.9046213523922287e-05, "loss": 0.2348, "step": 4670 }, { "epoch": 0.3636611357111576, "grad_norm": 1.5639848201557465, "learning_rate": 2.9041761248524197e-05, "loss": 0.2349, "step": 4671 }, { "epoch": 0.36373899080336725, "grad_norm": 1.4564373518133784, "learning_rate": 2.9037308409866886e-05, "loss": 0.2099, "step": 4672 }, { "epoch": 0.36381684589557683, "grad_norm": 1.6072133767154346, "learning_rate": 2.903285500822773e-05, "loss": 0.2594, "step": 4673 }, { "epoch": 0.3638947009877865, "grad_norm": 1.6242078162547569, "learning_rate": 2.902840104388418e-05, "loss": 0.2331, "step": 4674 }, { "epoch": 0.3639725560799961, "grad_norm": 1.5573486335822473, "learning_rate": 2.902394651711368e-05, "loss": 0.2307, "step": 4675 }, { "epoch": 0.3640504111722057, "grad_norm": 1.4001129253861662, "learning_rate": 2.9019491428193723e-05, "loss": 0.2146, "step": 4676 }, { "epoch": 0.36412826626441536, "grad_norm": 1.621751213865315, "learning_rate": 2.9015035777401847e-05, "loss": 0.286, "step": 4677 }, { "epoch": 0.364206121356625, "grad_norm": 1.6792872952066646, "learning_rate": 2.9010579565015617e-05, "loss": 0.3312, "step": 4678 }, { "epoch": 0.3642839764488346, "grad_norm": 1.6177639310543086, "learning_rate": 2.9006122791312633e-05, "loss": 0.255, "step": 4679 }, { "epoch": 0.36436183154104423, "grad_norm": 1.5694229677830116, "learning_rate": 2.9001665456570537e-05, "loss": 0.2553, "step": 4680 }, { "epoch": 0.3644396866332539, "grad_norm": 1.330128115957676, "learning_rate": 2.8997207561066984e-05, "loss": 0.1932, "step": 4681 }, { "epoch": 0.36451754172546347, "grad_norm": 1.4160300243781847, "learning_rate": 2.8992749105079694e-05, "loss": 0.2181, "step": 4682 }, { "epoch": 0.3645953968176731, "grad_norm": 1.5431752148495375, "learning_rate": 2.898829008888641e-05, "loss": 0.2184, "step": 4683 }, { "epoch": 0.36467325190988276, "grad_norm": 1.4656990126754972, "learning_rate": 2.89838305127649e-05, "loss": 0.2562, "step": 4684 }, { "epoch": 0.36475110700209235, "grad_norm": 1.6222362721934094, "learning_rate": 2.8979370376992976e-05, "loss": 0.2537, "step": 4685 }, { "epoch": 0.364828962094302, "grad_norm": 1.5184744217065558, "learning_rate": 2.8974909681848475e-05, "loss": 0.2272, "step": 4686 }, { "epoch": 0.3649068171865116, "grad_norm": 1.4567469471925176, "learning_rate": 2.89704484276093e-05, "loss": 0.2274, "step": 4687 }, { "epoch": 0.3649846722787212, "grad_norm": 1.6895147758911941, "learning_rate": 2.896598661455335e-05, "loss": 0.245, "step": 4688 }, { "epoch": 0.36506252737093087, "grad_norm": 1.4287714095235158, "learning_rate": 2.8961524242958573e-05, "loss": 0.2298, "step": 4689 }, { "epoch": 0.36514038246314046, "grad_norm": 1.6235420859715501, "learning_rate": 2.8957061313102962e-05, "loss": 0.2609, "step": 4690 }, { "epoch": 0.3652182375553501, "grad_norm": 1.4850588941977334, "learning_rate": 2.895259782526453e-05, "loss": 0.2519, "step": 4691 }, { "epoch": 0.36529609264755974, "grad_norm": 1.3799872085123255, "learning_rate": 2.8948133779721338e-05, "loss": 0.2082, "step": 4692 }, { "epoch": 0.36537394773976933, "grad_norm": 1.6043954193827965, "learning_rate": 2.894366917675147e-05, "loss": 0.2896, "step": 4693 }, { "epoch": 0.365451802831979, "grad_norm": 1.508467173101119, "learning_rate": 2.893920401663305e-05, "loss": 0.2523, "step": 4694 }, { "epoch": 0.3655296579241886, "grad_norm": 1.5975995604663547, "learning_rate": 2.8934738299644244e-05, "loss": 0.2245, "step": 4695 }, { "epoch": 0.3656075130163982, "grad_norm": 1.4814734278193864, "learning_rate": 2.8930272026063234e-05, "loss": 0.2374, "step": 4696 }, { "epoch": 0.36568536810860786, "grad_norm": 1.514019646102173, "learning_rate": 2.892580519616825e-05, "loss": 0.2299, "step": 4697 }, { "epoch": 0.3657632232008175, "grad_norm": 1.6102769321865695, "learning_rate": 2.892133781023756e-05, "loss": 0.2325, "step": 4698 }, { "epoch": 0.3658410782930271, "grad_norm": 1.511458142893096, "learning_rate": 2.891686986854946e-05, "loss": 0.2364, "step": 4699 }, { "epoch": 0.36591893338523673, "grad_norm": 1.4105726088444206, "learning_rate": 2.891240137138228e-05, "loss": 0.2363, "step": 4700 }, { "epoch": 0.36591893338523673, "eval_loss": 0.03014599159359932, "eval_runtime": 162.0111, "eval_samples_per_second": 17.777, "eval_steps_per_second": 0.636, "step": 4700 }, { "epoch": 0.3659967884774464, "grad_norm": 1.537538351879266, "learning_rate": 2.8907932319014386e-05, "loss": 0.248, "step": 4701 }, { "epoch": 0.36607464356965597, "grad_norm": 1.527361432168645, "learning_rate": 2.8903462711724172e-05, "loss": 0.2542, "step": 4702 }, { "epoch": 0.3661524986618656, "grad_norm": 1.4690715041705538, "learning_rate": 2.889899254979009e-05, "loss": 0.2391, "step": 4703 }, { "epoch": 0.36623035375407526, "grad_norm": 1.5729055621582675, "learning_rate": 2.88945218334906e-05, "loss": 0.2795, "step": 4704 }, { "epoch": 0.36630820884628484, "grad_norm": 1.4768931652250208, "learning_rate": 2.889005056310421e-05, "loss": 0.2233, "step": 4705 }, { "epoch": 0.3663860639384945, "grad_norm": 1.4339745851766803, "learning_rate": 2.8885578738909452e-05, "loss": 0.2009, "step": 4706 }, { "epoch": 0.3664639190307041, "grad_norm": 1.5193150730670073, "learning_rate": 2.88811063611849e-05, "loss": 0.235, "step": 4707 }, { "epoch": 0.3665417741229137, "grad_norm": 1.5291683925337856, "learning_rate": 2.887663343020918e-05, "loss": 0.2441, "step": 4708 }, { "epoch": 0.36661962921512337, "grad_norm": 1.5178078256238225, "learning_rate": 2.8872159946260915e-05, "loss": 0.2284, "step": 4709 }, { "epoch": 0.36669748430733295, "grad_norm": 1.5932873396468354, "learning_rate": 2.8867685909618782e-05, "loss": 0.2312, "step": 4710 }, { "epoch": 0.3667753393995426, "grad_norm": 1.5068285233627305, "learning_rate": 2.8863211320561508e-05, "loss": 0.2188, "step": 4711 }, { "epoch": 0.36685319449175224, "grad_norm": 1.442411918794169, "learning_rate": 2.8858736179367823e-05, "loss": 0.2245, "step": 4712 }, { "epoch": 0.36693104958396183, "grad_norm": 1.4020842584191933, "learning_rate": 2.8854260486316516e-05, "loss": 0.225, "step": 4713 }, { "epoch": 0.3670089046761715, "grad_norm": 1.5478530875192495, "learning_rate": 2.8849784241686404e-05, "loss": 0.2538, "step": 4714 }, { "epoch": 0.3670867597683811, "grad_norm": 1.5172459593619436, "learning_rate": 2.8845307445756322e-05, "loss": 0.2314, "step": 4715 }, { "epoch": 0.3671646148605907, "grad_norm": 1.5706823686488913, "learning_rate": 2.8840830098805166e-05, "loss": 0.2249, "step": 4716 }, { "epoch": 0.36724246995280035, "grad_norm": 1.451589764563926, "learning_rate": 2.8836352201111855e-05, "loss": 0.2201, "step": 4717 }, { "epoch": 0.36732032504501, "grad_norm": 1.477476747522838, "learning_rate": 2.8831873752955336e-05, "loss": 0.2141, "step": 4718 }, { "epoch": 0.3673981801372196, "grad_norm": 1.4234153822995985, "learning_rate": 2.8827394754614593e-05, "loss": 0.2327, "step": 4719 }, { "epoch": 0.36747603522942923, "grad_norm": 1.5188509693922, "learning_rate": 2.8822915206368643e-05, "loss": 0.2386, "step": 4720 }, { "epoch": 0.3675538903216389, "grad_norm": 1.4751031538221695, "learning_rate": 2.8818435108496556e-05, "loss": 0.2006, "step": 4721 }, { "epoch": 0.36763174541384847, "grad_norm": 1.480484671231894, "learning_rate": 2.8813954461277405e-05, "loss": 0.2415, "step": 4722 }, { "epoch": 0.3677096005060581, "grad_norm": 1.4777054828420393, "learning_rate": 2.880947326499033e-05, "loss": 0.2875, "step": 4723 }, { "epoch": 0.3677874555982677, "grad_norm": 1.4544086551351285, "learning_rate": 2.8804991519914473e-05, "loss": 0.2132, "step": 4724 }, { "epoch": 0.36786531069047734, "grad_norm": 1.7096024433118797, "learning_rate": 2.880050922632903e-05, "loss": 0.2564, "step": 4725 }, { "epoch": 0.367943165782687, "grad_norm": 1.439092912744828, "learning_rate": 2.879602638451323e-05, "loss": 0.2275, "step": 4726 }, { "epoch": 0.3680210208748966, "grad_norm": 1.459074889745025, "learning_rate": 2.879154299474633e-05, "loss": 0.2328, "step": 4727 }, { "epoch": 0.3680988759671062, "grad_norm": 1.4401568291890472, "learning_rate": 2.8787059057307626e-05, "loss": 0.2036, "step": 4728 }, { "epoch": 0.36817673105931586, "grad_norm": 1.6047953769431207, "learning_rate": 2.878257457247645e-05, "loss": 0.2541, "step": 4729 }, { "epoch": 0.36825458615152545, "grad_norm": 1.4465252662695331, "learning_rate": 2.8778089540532152e-05, "loss": 0.2025, "step": 4730 }, { "epoch": 0.3683324412437351, "grad_norm": 1.411173543144865, "learning_rate": 2.877360396175414e-05, "loss": 0.2256, "step": 4731 }, { "epoch": 0.36841029633594474, "grad_norm": 1.5390819246056784, "learning_rate": 2.876911783642184e-05, "loss": 0.2531, "step": 4732 }, { "epoch": 0.36848815142815433, "grad_norm": 1.6476456688510464, "learning_rate": 2.876463116481472e-05, "loss": 0.2678, "step": 4733 }, { "epoch": 0.368566006520364, "grad_norm": 1.4599988924774494, "learning_rate": 2.8760143947212276e-05, "loss": 0.2512, "step": 4734 }, { "epoch": 0.3686438616125736, "grad_norm": 1.3902169693740123, "learning_rate": 2.8755656183894034e-05, "loss": 0.2006, "step": 4735 }, { "epoch": 0.3687217167047832, "grad_norm": 1.53395099226405, "learning_rate": 2.875116787513957e-05, "loss": 0.215, "step": 4736 }, { "epoch": 0.36879957179699285, "grad_norm": 1.4946619696189263, "learning_rate": 2.8746679021228492e-05, "loss": 0.2138, "step": 4737 }, { "epoch": 0.3688774268892025, "grad_norm": 1.57146396048205, "learning_rate": 2.874218962244041e-05, "loss": 0.2277, "step": 4738 }, { "epoch": 0.3689552819814121, "grad_norm": 1.5409917144603256, "learning_rate": 2.8737699679055013e-05, "loss": 0.2504, "step": 4739 }, { "epoch": 0.36903313707362173, "grad_norm": 1.4701540896227128, "learning_rate": 2.873320919135199e-05, "loss": 0.2412, "step": 4740 }, { "epoch": 0.3691109921658313, "grad_norm": 1.5984281903000679, "learning_rate": 2.872871815961109e-05, "loss": 0.2259, "step": 4741 }, { "epoch": 0.36918884725804096, "grad_norm": 1.4460702771166234, "learning_rate": 2.8724226584112084e-05, "loss": 0.2062, "step": 4742 }, { "epoch": 0.3692667023502506, "grad_norm": 1.3649946276711755, "learning_rate": 2.871973446513476e-05, "loss": 0.2517, "step": 4743 }, { "epoch": 0.3693445574424602, "grad_norm": 1.4247140836929189, "learning_rate": 2.8715241802958965e-05, "loss": 0.2267, "step": 4744 }, { "epoch": 0.36942241253466984, "grad_norm": 1.3992427479334124, "learning_rate": 2.871074859786457e-05, "loss": 0.1914, "step": 4745 }, { "epoch": 0.3695002676268795, "grad_norm": 1.5491488904161457, "learning_rate": 2.870625485013149e-05, "loss": 0.2587, "step": 4746 }, { "epoch": 0.3695781227190891, "grad_norm": 1.4590809005455418, "learning_rate": 2.8701760560039652e-05, "loss": 0.2273, "step": 4747 }, { "epoch": 0.3696559778112987, "grad_norm": 1.5496232839710469, "learning_rate": 2.8697265727869027e-05, "loss": 0.2325, "step": 4748 }, { "epoch": 0.36973383290350836, "grad_norm": 1.5840449606803877, "learning_rate": 2.869277035389963e-05, "loss": 0.2208, "step": 4749 }, { "epoch": 0.36981168799571795, "grad_norm": 1.4932632432597974, "learning_rate": 2.8688274438411504e-05, "loss": 0.2501, "step": 4750 }, { "epoch": 0.36981168799571795, "eval_loss": 0.029694661498069763, "eval_runtime": 162.6895, "eval_samples_per_second": 17.702, "eval_steps_per_second": 0.633, "step": 4750 }, { "epoch": 0.3698895430879276, "grad_norm": 1.4499205339816732, "learning_rate": 2.8683777981684716e-05, "loss": 0.237, "step": 4751 }, { "epoch": 0.36996739818013724, "grad_norm": 1.5330450891493383, "learning_rate": 2.867928098399938e-05, "loss": 0.2335, "step": 4752 }, { "epoch": 0.37004525327234683, "grad_norm": 1.4336613303173593, "learning_rate": 2.867478344563563e-05, "loss": 0.2165, "step": 4753 }, { "epoch": 0.3701231083645565, "grad_norm": 1.6308855010002317, "learning_rate": 2.867028536687365e-05, "loss": 0.2442, "step": 4754 }, { "epoch": 0.3702009634567661, "grad_norm": 1.415297232079579, "learning_rate": 2.8665786747993648e-05, "loss": 0.2151, "step": 4755 }, { "epoch": 0.3702788185489757, "grad_norm": 1.471581723402147, "learning_rate": 2.866128758927586e-05, "loss": 0.1972, "step": 4756 }, { "epoch": 0.37035667364118535, "grad_norm": 1.4563463898981026, "learning_rate": 2.8656787891000565e-05, "loss": 0.2412, "step": 4757 }, { "epoch": 0.37043452873339494, "grad_norm": 1.363561892504552, "learning_rate": 2.8652287653448078e-05, "loss": 0.2102, "step": 4758 }, { "epoch": 0.3705123838256046, "grad_norm": 1.4250156208152212, "learning_rate": 2.864778687689874e-05, "loss": 0.2169, "step": 4759 }, { "epoch": 0.37059023891781423, "grad_norm": 1.4115712040505048, "learning_rate": 2.864328556163293e-05, "loss": 0.2225, "step": 4760 }, { "epoch": 0.3706680940100238, "grad_norm": 1.455884666918258, "learning_rate": 2.8638783707931057e-05, "loss": 0.2227, "step": 4761 }, { "epoch": 0.37074594910223346, "grad_norm": 1.444306295505487, "learning_rate": 2.863428131607356e-05, "loss": 0.2742, "step": 4762 }, { "epoch": 0.3708238041944431, "grad_norm": 1.528015609589588, "learning_rate": 2.8629778386340917e-05, "loss": 0.2297, "step": 4763 }, { "epoch": 0.3709016592866527, "grad_norm": 1.4054147183490613, "learning_rate": 2.8625274919013652e-05, "loss": 0.2221, "step": 4764 }, { "epoch": 0.37097951437886234, "grad_norm": 1.3690906954235986, "learning_rate": 2.8620770914372304e-05, "loss": 0.2028, "step": 4765 }, { "epoch": 0.371057369471072, "grad_norm": 1.4708614324580007, "learning_rate": 2.861626637269744e-05, "loss": 0.2374, "step": 4766 }, { "epoch": 0.3711352245632816, "grad_norm": 1.5405155563004638, "learning_rate": 2.8611761294269694e-05, "loss": 0.2243, "step": 4767 }, { "epoch": 0.3712130796554912, "grad_norm": 1.4088529405192887, "learning_rate": 2.8607255679369684e-05, "loss": 0.2344, "step": 4768 }, { "epoch": 0.37129093474770086, "grad_norm": 1.4747263698257072, "learning_rate": 2.8602749528278112e-05, "loss": 0.2277, "step": 4769 }, { "epoch": 0.37136878983991045, "grad_norm": 1.4961228140492975, "learning_rate": 2.8598242841275676e-05, "loss": 0.1977, "step": 4770 }, { "epoch": 0.3714466449321201, "grad_norm": 1.490803044800362, "learning_rate": 2.8593735618643133e-05, "loss": 0.2076, "step": 4771 }, { "epoch": 0.37152450002432974, "grad_norm": 1.4921867991514512, "learning_rate": 2.858922786066125e-05, "loss": 0.2335, "step": 4772 }, { "epoch": 0.37160235511653933, "grad_norm": 1.3170299946290756, "learning_rate": 2.8584719567610845e-05, "loss": 0.2105, "step": 4773 }, { "epoch": 0.371680210208749, "grad_norm": 1.5032560046594872, "learning_rate": 2.858021073977276e-05, "loss": 0.2243, "step": 4774 }, { "epoch": 0.3717580653009586, "grad_norm": 1.6910104685505007, "learning_rate": 2.8575701377427884e-05, "loss": 0.2595, "step": 4775 }, { "epoch": 0.3718359203931682, "grad_norm": 1.4175864805668925, "learning_rate": 2.8571191480857115e-05, "loss": 0.2198, "step": 4776 }, { "epoch": 0.37191377548537785, "grad_norm": 1.438406988113545, "learning_rate": 2.8566681050341408e-05, "loss": 0.2186, "step": 4777 }, { "epoch": 0.37199163057758744, "grad_norm": 1.5174707210075733, "learning_rate": 2.8562170086161737e-05, "loss": 0.2515, "step": 4778 }, { "epoch": 0.3720694856697971, "grad_norm": 1.5293384714995968, "learning_rate": 2.8557658588599113e-05, "loss": 0.2743, "step": 4779 }, { "epoch": 0.37214734076200673, "grad_norm": 1.5018149409648578, "learning_rate": 2.8553146557934587e-05, "loss": 0.2385, "step": 4780 }, { "epoch": 0.3722251958542163, "grad_norm": 1.5206882737394278, "learning_rate": 2.854863399444923e-05, "loss": 0.2762, "step": 4781 }, { "epoch": 0.37230305094642596, "grad_norm": 1.3722890401866368, "learning_rate": 2.8544120898424162e-05, "loss": 0.2038, "step": 4782 }, { "epoch": 0.3723809060386356, "grad_norm": 1.532091058857584, "learning_rate": 2.853960727014052e-05, "loss": 0.263, "step": 4783 }, { "epoch": 0.3724587611308452, "grad_norm": 1.529477802128067, "learning_rate": 2.8535093109879487e-05, "loss": 0.2554, "step": 4784 }, { "epoch": 0.37253661622305484, "grad_norm": 1.4901783060027596, "learning_rate": 2.8530578417922276e-05, "loss": 0.2075, "step": 4785 }, { "epoch": 0.3726144713152645, "grad_norm": 1.4623745076474275, "learning_rate": 2.852606319455012e-05, "loss": 0.2666, "step": 4786 }, { "epoch": 0.3726923264074741, "grad_norm": 1.6172004126601935, "learning_rate": 2.85215474400443e-05, "loss": 0.2548, "step": 4787 }, { "epoch": 0.3727701814996837, "grad_norm": 1.5161768072069928, "learning_rate": 2.8517031154686136e-05, "loss": 0.2432, "step": 4788 }, { "epoch": 0.37284803659189336, "grad_norm": 1.4484028003363574, "learning_rate": 2.851251433875696e-05, "loss": 0.2459, "step": 4789 }, { "epoch": 0.37292589168410295, "grad_norm": 1.4293464170045707, "learning_rate": 2.850799699253816e-05, "loss": 0.2015, "step": 4790 }, { "epoch": 0.3730037467763126, "grad_norm": 1.5340067660804872, "learning_rate": 2.8503479116311134e-05, "loss": 0.2555, "step": 4791 }, { "epoch": 0.37308160186852224, "grad_norm": 1.56467385040416, "learning_rate": 2.8498960710357324e-05, "loss": 0.2557, "step": 4792 }, { "epoch": 0.37315945696073183, "grad_norm": 1.3532031352463243, "learning_rate": 2.849444177495822e-05, "loss": 0.1755, "step": 4793 }, { "epoch": 0.3732373120529415, "grad_norm": 1.3243994281541325, "learning_rate": 2.848992231039531e-05, "loss": 0.1995, "step": 4794 }, { "epoch": 0.37331516714515106, "grad_norm": 1.3813979908719498, "learning_rate": 2.8485402316950156e-05, "loss": 0.2065, "step": 4795 }, { "epoch": 0.3733930222373607, "grad_norm": 1.6203235853932845, "learning_rate": 2.8480881794904322e-05, "loss": 0.2301, "step": 4796 }, { "epoch": 0.37347087732957035, "grad_norm": 1.5552851642819379, "learning_rate": 2.8476360744539407e-05, "loss": 0.2373, "step": 4797 }, { "epoch": 0.37354873242177994, "grad_norm": 1.4375220894250391, "learning_rate": 2.8471839166137067e-05, "loss": 0.2411, "step": 4798 }, { "epoch": 0.3736265875139896, "grad_norm": 1.526577592977373, "learning_rate": 2.846731705997897e-05, "loss": 0.2364, "step": 4799 }, { "epoch": 0.3737044426061992, "grad_norm": 1.3554829303625555, "learning_rate": 2.8462794426346815e-05, "loss": 0.2063, "step": 4800 }, { "epoch": 0.3737044426061992, "eval_loss": 0.0292804092168808, "eval_runtime": 162.6242, "eval_samples_per_second": 17.71, "eval_steps_per_second": 0.633, "step": 4800 }, { "epoch": 0.3737822976984088, "grad_norm": 1.406006310383071, "learning_rate": 2.845827126552235e-05, "loss": 0.2128, "step": 4801 }, { "epoch": 0.37386015279061846, "grad_norm": 1.44684978199188, "learning_rate": 2.845374757778734e-05, "loss": 0.2101, "step": 4802 }, { "epoch": 0.3739380078828281, "grad_norm": 1.3811605250429329, "learning_rate": 2.844922336342359e-05, "loss": 0.2405, "step": 4803 }, { "epoch": 0.3740158629750377, "grad_norm": 1.3641768715225602, "learning_rate": 2.8444698622712944e-05, "loss": 0.2221, "step": 4804 }, { "epoch": 0.37409371806724734, "grad_norm": 1.4274360360808709, "learning_rate": 2.8440173355937263e-05, "loss": 0.2012, "step": 4805 }, { "epoch": 0.374171573159457, "grad_norm": 1.5958288798609044, "learning_rate": 2.8435647563378457e-05, "loss": 0.2343, "step": 4806 }, { "epoch": 0.37424942825166657, "grad_norm": 1.5195468553231506, "learning_rate": 2.8431121245318455e-05, "loss": 0.2691, "step": 4807 }, { "epoch": 0.3743272833438762, "grad_norm": 1.4815347839417052, "learning_rate": 2.8426594402039235e-05, "loss": 0.1944, "step": 4808 }, { "epoch": 0.37440513843608586, "grad_norm": 1.5440726733852534, "learning_rate": 2.8422067033822795e-05, "loss": 0.2651, "step": 4809 }, { "epoch": 0.37448299352829545, "grad_norm": 1.553288465261982, "learning_rate": 2.8417539140951155e-05, "loss": 0.2421, "step": 4810 }, { "epoch": 0.3745608486205051, "grad_norm": 1.4830453614499377, "learning_rate": 2.84130107237064e-05, "loss": 0.2391, "step": 4811 }, { "epoch": 0.3746387037127147, "grad_norm": 1.5265374239533973, "learning_rate": 2.8408481782370617e-05, "loss": 0.2668, "step": 4812 }, { "epoch": 0.3747165588049243, "grad_norm": 1.4683774700235461, "learning_rate": 2.8403952317225948e-05, "loss": 0.2205, "step": 4813 }, { "epoch": 0.37479441389713397, "grad_norm": 1.4657626420958392, "learning_rate": 2.8399422328554547e-05, "loss": 0.2434, "step": 4814 }, { "epoch": 0.37487226898934356, "grad_norm": 1.4020737477334264, "learning_rate": 2.8394891816638613e-05, "loss": 0.2167, "step": 4815 }, { "epoch": 0.3749501240815532, "grad_norm": 1.4685564986986968, "learning_rate": 2.8390360781760384e-05, "loss": 0.2102, "step": 4816 }, { "epoch": 0.37502797917376285, "grad_norm": 1.4349032124494423, "learning_rate": 2.8385829224202114e-05, "loss": 0.2378, "step": 4817 }, { "epoch": 0.37510583426597244, "grad_norm": 1.592861231694396, "learning_rate": 2.8381297144246107e-05, "loss": 0.2103, "step": 4818 }, { "epoch": 0.3751836893581821, "grad_norm": 1.5643284926570282, "learning_rate": 2.8376764542174674e-05, "loss": 0.2357, "step": 4819 }, { "epoch": 0.3752615444503917, "grad_norm": 2.0362767196372733, "learning_rate": 2.8372231418270186e-05, "loss": 0.2648, "step": 4820 }, { "epoch": 0.3753393995426013, "grad_norm": 1.4445095714679295, "learning_rate": 2.8367697772815037e-05, "loss": 0.2423, "step": 4821 }, { "epoch": 0.37541725463481096, "grad_norm": 1.5558321795782788, "learning_rate": 2.836316360609165e-05, "loss": 0.2334, "step": 4822 }, { "epoch": 0.3754951097270206, "grad_norm": 1.4334339285612858, "learning_rate": 2.8358628918382476e-05, "loss": 0.2522, "step": 4823 }, { "epoch": 0.3755729648192302, "grad_norm": 1.381305746772037, "learning_rate": 2.8354093709970017e-05, "loss": 0.1995, "step": 4824 }, { "epoch": 0.37565081991143984, "grad_norm": 1.4152221491708006, "learning_rate": 2.8349557981136775e-05, "loss": 0.2333, "step": 4825 }, { "epoch": 0.3757286750036495, "grad_norm": 1.5124474032068052, "learning_rate": 2.8345021732165327e-05, "loss": 0.2314, "step": 4826 }, { "epoch": 0.37580653009585907, "grad_norm": 1.468959350380463, "learning_rate": 2.8340484963338245e-05, "loss": 0.2349, "step": 4827 }, { "epoch": 0.3758843851880687, "grad_norm": 1.4077376451697892, "learning_rate": 2.8335947674938156e-05, "loss": 0.2182, "step": 4828 }, { "epoch": 0.37596224028027836, "grad_norm": 1.6020455885464449, "learning_rate": 2.8331409867247713e-05, "loss": 0.2272, "step": 4829 }, { "epoch": 0.37604009537248795, "grad_norm": 1.4868211769379707, "learning_rate": 2.832687154054959e-05, "loss": 0.2258, "step": 4830 }, { "epoch": 0.3761179504646976, "grad_norm": 1.60371043486206, "learning_rate": 2.832233269512651e-05, "loss": 0.2396, "step": 4831 }, { "epoch": 0.3761958055569072, "grad_norm": 1.4401412684871553, "learning_rate": 2.8317793331261226e-05, "loss": 0.2026, "step": 4832 }, { "epoch": 0.3762736606491168, "grad_norm": 1.3766403078057299, "learning_rate": 2.8313253449236514e-05, "loss": 0.2002, "step": 4833 }, { "epoch": 0.37635151574132647, "grad_norm": 1.3683226968852789, "learning_rate": 2.8308713049335184e-05, "loss": 0.21, "step": 4834 }, { "epoch": 0.37642937083353606, "grad_norm": 1.3764609702890447, "learning_rate": 2.830417213184008e-05, "loss": 0.2296, "step": 4835 }, { "epoch": 0.3765072259257457, "grad_norm": 1.4687302329071081, "learning_rate": 2.8299630697034096e-05, "loss": 0.2279, "step": 4836 }, { "epoch": 0.37658508101795535, "grad_norm": 1.4812262374044198, "learning_rate": 2.8295088745200132e-05, "loss": 0.2183, "step": 4837 }, { "epoch": 0.37666293611016494, "grad_norm": 1.4045139944095089, "learning_rate": 2.8290546276621125e-05, "loss": 0.2547, "step": 4838 }, { "epoch": 0.3767407912023746, "grad_norm": 1.392697371671946, "learning_rate": 2.828600329158005e-05, "loss": 0.2288, "step": 4839 }, { "epoch": 0.3768186462945842, "grad_norm": 1.4930760239123424, "learning_rate": 2.8281459790359922e-05, "loss": 0.2547, "step": 4840 }, { "epoch": 0.3768965013867938, "grad_norm": 1.509365612262506, "learning_rate": 2.8276915773243775e-05, "loss": 0.2355, "step": 4841 }, { "epoch": 0.37697435647900346, "grad_norm": 1.6228057616189269, "learning_rate": 2.8272371240514688e-05, "loss": 0.2572, "step": 4842 }, { "epoch": 0.3770522115712131, "grad_norm": 1.472284247394024, "learning_rate": 2.8267826192455743e-05, "loss": 0.2118, "step": 4843 }, { "epoch": 0.3771300666634227, "grad_norm": 1.4215569218386548, "learning_rate": 2.8263280629350097e-05, "loss": 0.2188, "step": 4844 }, { "epoch": 0.37720792175563234, "grad_norm": 1.4450416309116518, "learning_rate": 2.8258734551480903e-05, "loss": 0.2524, "step": 4845 }, { "epoch": 0.377285776847842, "grad_norm": 1.435900927440838, "learning_rate": 2.8254187959131374e-05, "loss": 0.2138, "step": 4846 }, { "epoch": 0.37736363194005157, "grad_norm": 1.3180081576081564, "learning_rate": 2.824964085258473e-05, "loss": 0.2297, "step": 4847 }, { "epoch": 0.3774414870322612, "grad_norm": 1.4519445759277152, "learning_rate": 2.8245093232124232e-05, "loss": 0.2222, "step": 4848 }, { "epoch": 0.3775193421244708, "grad_norm": 1.3615697734410694, "learning_rate": 2.8240545098033182e-05, "loss": 0.2246, "step": 4849 }, { "epoch": 0.37759719721668045, "grad_norm": 1.3949594211865621, "learning_rate": 2.8235996450594912e-05, "loss": 0.1854, "step": 4850 }, { "epoch": 0.37759719721668045, "eval_loss": 0.028127091005444527, "eval_runtime": 162.8229, "eval_samples_per_second": 17.688, "eval_steps_per_second": 0.633, "step": 4850 }, { "epoch": 0.3776750523088901, "grad_norm": 1.3855675918182384, "learning_rate": 2.8231447290092773e-05, "loss": 0.2038, "step": 4851 }, { "epoch": 0.3777529074010997, "grad_norm": 1.4407546646575196, "learning_rate": 2.822689761681016e-05, "loss": 0.2465, "step": 4852 }, { "epoch": 0.3778307624933093, "grad_norm": 1.4083600222365376, "learning_rate": 2.822234743103049e-05, "loss": 0.2101, "step": 4853 }, { "epoch": 0.37790861758551897, "grad_norm": 1.4652498771080458, "learning_rate": 2.821779673303723e-05, "loss": 0.215, "step": 4854 }, { "epoch": 0.37798647267772856, "grad_norm": 1.3236185912017282, "learning_rate": 2.821324552311386e-05, "loss": 0.2057, "step": 4855 }, { "epoch": 0.3780643277699382, "grad_norm": 1.4660111409820464, "learning_rate": 2.8208693801543897e-05, "loss": 0.2052, "step": 4856 }, { "epoch": 0.37814218286214785, "grad_norm": 1.3880509133198577, "learning_rate": 2.8204141568610895e-05, "loss": 0.243, "step": 4857 }, { "epoch": 0.37822003795435744, "grad_norm": 1.507803602150047, "learning_rate": 2.8199588824598433e-05, "loss": 0.206, "step": 4858 }, { "epoch": 0.3782978930465671, "grad_norm": 1.5241433104083477, "learning_rate": 2.8195035569790135e-05, "loss": 0.2416, "step": 4859 }, { "epoch": 0.3783757481387767, "grad_norm": 1.4542198311185588, "learning_rate": 2.819048180446964e-05, "loss": 0.2258, "step": 4860 }, { "epoch": 0.3784536032309863, "grad_norm": 1.3733365101791315, "learning_rate": 2.8185927528920625e-05, "loss": 0.2198, "step": 4861 }, { "epoch": 0.37853145832319596, "grad_norm": 1.5051156773777576, "learning_rate": 2.8181372743426805e-05, "loss": 0.2144, "step": 4862 }, { "epoch": 0.3786093134154056, "grad_norm": 1.3622040914608997, "learning_rate": 2.817681744827191e-05, "loss": 0.2215, "step": 4863 }, { "epoch": 0.3786871685076152, "grad_norm": 1.379659489410257, "learning_rate": 2.8172261643739732e-05, "loss": 0.2017, "step": 4864 }, { "epoch": 0.37876502359982483, "grad_norm": 1.4282148577107507, "learning_rate": 2.8167705330114064e-05, "loss": 0.2266, "step": 4865 }, { "epoch": 0.3788428786920344, "grad_norm": 1.6288088591610872, "learning_rate": 2.8163148507678744e-05, "loss": 0.2671, "step": 4866 }, { "epoch": 0.37892073378424407, "grad_norm": 1.5347993525326387, "learning_rate": 2.8158591176717646e-05, "loss": 0.2288, "step": 4867 }, { "epoch": 0.3789985888764537, "grad_norm": 1.4613776448866287, "learning_rate": 2.8154033337514666e-05, "loss": 0.2356, "step": 4868 }, { "epoch": 0.3790764439686633, "grad_norm": 1.5410334261430276, "learning_rate": 2.8149474990353728e-05, "loss": 0.2504, "step": 4869 }, { "epoch": 0.37915429906087295, "grad_norm": 1.4147126009586246, "learning_rate": 2.814491613551881e-05, "loss": 0.2255, "step": 4870 }, { "epoch": 0.3792321541530826, "grad_norm": 1.369125808148331, "learning_rate": 2.8140356773293903e-05, "loss": 0.2066, "step": 4871 }, { "epoch": 0.3793100092452922, "grad_norm": 1.5010633021518203, "learning_rate": 2.8135796903963033e-05, "loss": 0.2311, "step": 4872 }, { "epoch": 0.3793878643375018, "grad_norm": 1.436579891053444, "learning_rate": 2.8131236527810256e-05, "loss": 0.2003, "step": 4873 }, { "epoch": 0.37946571942971147, "grad_norm": 1.4860753552373562, "learning_rate": 2.8126675645119657e-05, "loss": 0.2129, "step": 4874 }, { "epoch": 0.37954357452192106, "grad_norm": 1.468581219771734, "learning_rate": 2.8122114256175375e-05, "loss": 0.2284, "step": 4875 }, { "epoch": 0.3796214296141307, "grad_norm": 1.532200857679202, "learning_rate": 2.8117552361261543e-05, "loss": 0.2612, "step": 4876 }, { "epoch": 0.37969928470634035, "grad_norm": 1.53484840242862, "learning_rate": 2.811298996066236e-05, "loss": 0.2352, "step": 4877 }, { "epoch": 0.37977713979854993, "grad_norm": 1.6878130263591888, "learning_rate": 2.8108427054662035e-05, "loss": 0.2267, "step": 4878 }, { "epoch": 0.3798549948907596, "grad_norm": 1.4797453190327217, "learning_rate": 2.8103863643544814e-05, "loss": 0.202, "step": 4879 }, { "epoch": 0.3799328499829692, "grad_norm": 1.4478091733427148, "learning_rate": 2.8099299727594987e-05, "loss": 0.2467, "step": 4880 }, { "epoch": 0.3800107050751788, "grad_norm": 1.373064178711986, "learning_rate": 2.809473530709685e-05, "loss": 0.2263, "step": 4881 }, { "epoch": 0.38008856016738846, "grad_norm": 1.3906085309631715, "learning_rate": 2.809017038233476e-05, "loss": 0.2273, "step": 4882 }, { "epoch": 0.3801664152595981, "grad_norm": 1.3794250163370783, "learning_rate": 2.8085604953593075e-05, "loss": 0.2299, "step": 4883 }, { "epoch": 0.3802442703518077, "grad_norm": 1.5916801433443728, "learning_rate": 2.808103902115621e-05, "loss": 0.234, "step": 4884 }, { "epoch": 0.38032212544401733, "grad_norm": 2.852597957961319, "learning_rate": 2.8076472585308604e-05, "loss": 0.2363, "step": 4885 }, { "epoch": 0.3803999805362269, "grad_norm": 1.4094716437451464, "learning_rate": 2.8071905646334713e-05, "loss": 0.2211, "step": 4886 }, { "epoch": 0.38047783562843657, "grad_norm": 1.6735409697571932, "learning_rate": 2.8067338204519035e-05, "loss": 0.2835, "step": 4887 }, { "epoch": 0.3805556907206462, "grad_norm": 1.3979577399457086, "learning_rate": 2.806277026014612e-05, "loss": 0.2376, "step": 4888 }, { "epoch": 0.3806335458128558, "grad_norm": 1.407890101656086, "learning_rate": 2.8058201813500507e-05, "loss": 0.2421, "step": 4889 }, { "epoch": 0.38071140090506544, "grad_norm": 1.4242352151276463, "learning_rate": 2.80536328648668e-05, "loss": 0.2091, "step": 4890 }, { "epoch": 0.3807892559972751, "grad_norm": 1.4718359844330737, "learning_rate": 2.804906341452962e-05, "loss": 0.2455, "step": 4891 }, { "epoch": 0.3808671110894847, "grad_norm": 1.3821756102982445, "learning_rate": 2.804449346277362e-05, "loss": 0.2134, "step": 4892 }, { "epoch": 0.3809449661816943, "grad_norm": 1.243499219317821, "learning_rate": 2.8039923009883497e-05, "loss": 0.2015, "step": 4893 }, { "epoch": 0.38102282127390397, "grad_norm": 1.338357726926968, "learning_rate": 2.803535205614396e-05, "loss": 0.233, "step": 4894 }, { "epoch": 0.38110067636611356, "grad_norm": 1.5258348005132536, "learning_rate": 2.8030780601839757e-05, "loss": 0.2796, "step": 4895 }, { "epoch": 0.3811785314583232, "grad_norm": 1.4236664935414283, "learning_rate": 2.802620864725567e-05, "loss": 0.2024, "step": 4896 }, { "epoch": 0.38125638655053284, "grad_norm": 1.522404294905766, "learning_rate": 2.802163619267651e-05, "loss": 0.2172, "step": 4897 }, { "epoch": 0.38133424164274243, "grad_norm": 1.5950622566934007, "learning_rate": 2.801706323838712e-05, "loss": 0.2754, "step": 4898 }, { "epoch": 0.3814120967349521, "grad_norm": 1.3879150685155779, "learning_rate": 2.8012489784672382e-05, "loss": 0.2021, "step": 4899 }, { "epoch": 0.3814899518271617, "grad_norm": 1.3928456760661276, "learning_rate": 2.8007915831817184e-05, "loss": 0.2294, "step": 4900 }, { "epoch": 0.3814899518271617, "eval_loss": 0.027414266020059586, "eval_runtime": 162.8303, "eval_samples_per_second": 17.687, "eval_steps_per_second": 0.633, "step": 4900 }, { "epoch": 0.3815678069193713, "grad_norm": 1.4945748975793227, "learning_rate": 2.8003341380106474e-05, "loss": 0.2291, "step": 4901 }, { "epoch": 0.38164566201158096, "grad_norm": 1.4942378170825614, "learning_rate": 2.799876642982521e-05, "loss": 0.2234, "step": 4902 }, { "epoch": 0.38172351710379054, "grad_norm": 1.414348414457936, "learning_rate": 2.79941909812584e-05, "loss": 0.2417, "step": 4903 }, { "epoch": 0.3818013721960002, "grad_norm": 1.3204732634965468, "learning_rate": 2.798961503469107e-05, "loss": 0.1969, "step": 4904 }, { "epoch": 0.38187922728820983, "grad_norm": 1.382440914417873, "learning_rate": 2.7985038590408274e-05, "loss": 0.2049, "step": 4905 }, { "epoch": 0.3819570823804194, "grad_norm": 1.455071640521309, "learning_rate": 2.7980461648695105e-05, "loss": 0.2052, "step": 4906 }, { "epoch": 0.38203493747262907, "grad_norm": 1.395945884955009, "learning_rate": 2.797588420983669e-05, "loss": 0.2116, "step": 4907 }, { "epoch": 0.3821127925648387, "grad_norm": 1.4528490196046162, "learning_rate": 2.797130627411818e-05, "loss": 0.2013, "step": 4908 }, { "epoch": 0.3821906476570483, "grad_norm": 1.4156299915237465, "learning_rate": 2.7966727841824757e-05, "loss": 0.2024, "step": 4909 }, { "epoch": 0.38226850274925794, "grad_norm": 1.4510764167796701, "learning_rate": 2.7962148913241632e-05, "loss": 0.1842, "step": 4910 }, { "epoch": 0.3823463578414676, "grad_norm": 1.470788559695645, "learning_rate": 2.795756948865406e-05, "loss": 0.2271, "step": 4911 }, { "epoch": 0.3824242129336772, "grad_norm": 1.3366879354063637, "learning_rate": 2.7952989568347305e-05, "loss": 0.2136, "step": 4912 }, { "epoch": 0.3825020680258868, "grad_norm": 1.3752841549719443, "learning_rate": 2.794840915260669e-05, "loss": 0.2104, "step": 4913 }, { "epoch": 0.38257992311809647, "grad_norm": 1.497918660071964, "learning_rate": 2.7943828241717546e-05, "loss": 0.2229, "step": 4914 }, { "epoch": 0.38265777821030605, "grad_norm": 1.497663315206761, "learning_rate": 2.7939246835965236e-05, "loss": 0.2076, "step": 4915 }, { "epoch": 0.3827356333025157, "grad_norm": 1.587494076239407, "learning_rate": 2.793466493563517e-05, "loss": 0.25, "step": 4916 }, { "epoch": 0.38281348839472534, "grad_norm": 1.4675458661520646, "learning_rate": 2.793008254101277e-05, "loss": 0.2, "step": 4917 }, { "epoch": 0.38289134348693493, "grad_norm": 1.5343847760608673, "learning_rate": 2.7925499652383508e-05, "loss": 0.2302, "step": 4918 }, { "epoch": 0.3829691985791446, "grad_norm": 1.6266762557773282, "learning_rate": 2.7920916270032873e-05, "loss": 0.2236, "step": 4919 }, { "epoch": 0.38304705367135417, "grad_norm": 1.4085726349353764, "learning_rate": 2.7916332394246383e-05, "loss": 0.2187, "step": 4920 }, { "epoch": 0.3831249087635638, "grad_norm": 1.4316828223890856, "learning_rate": 2.7911748025309592e-05, "loss": 0.2283, "step": 4921 }, { "epoch": 0.38320276385577345, "grad_norm": 1.5056635026280452, "learning_rate": 2.790716316350809e-05, "loss": 0.2038, "step": 4922 }, { "epoch": 0.38328061894798304, "grad_norm": 1.4797717733260274, "learning_rate": 2.7902577809127488e-05, "loss": 0.2284, "step": 4923 }, { "epoch": 0.3833584740401927, "grad_norm": 1.3730528971837157, "learning_rate": 2.7897991962453436e-05, "loss": 0.2232, "step": 4924 }, { "epoch": 0.38343632913240233, "grad_norm": 1.3587871478243325, "learning_rate": 2.7893405623771603e-05, "loss": 0.2298, "step": 4925 }, { "epoch": 0.3835141842246119, "grad_norm": 1.468618755522899, "learning_rate": 2.788881879336771e-05, "loss": 0.2296, "step": 4926 }, { "epoch": 0.38359203931682156, "grad_norm": 1.3949036637487795, "learning_rate": 2.7884231471527485e-05, "loss": 0.2071, "step": 4927 }, { "epoch": 0.3836698944090312, "grad_norm": 1.4800312564946758, "learning_rate": 2.78796436585367e-05, "loss": 0.2438, "step": 4928 }, { "epoch": 0.3837477495012408, "grad_norm": 1.5041341536463473, "learning_rate": 2.787505535468115e-05, "loss": 0.2303, "step": 4929 }, { "epoch": 0.38382560459345044, "grad_norm": 1.3397423528728625, "learning_rate": 2.787046656024666e-05, "loss": 0.2292, "step": 4930 }, { "epoch": 0.3839034596856601, "grad_norm": 1.3467592349143103, "learning_rate": 2.786587727551911e-05, "loss": 0.2175, "step": 4931 }, { "epoch": 0.3839813147778697, "grad_norm": 1.3830014147895977, "learning_rate": 2.7861287500784372e-05, "loss": 0.1903, "step": 4932 }, { "epoch": 0.3840591698700793, "grad_norm": 1.3568387609446433, "learning_rate": 2.7856697236328378e-05, "loss": 0.2161, "step": 4933 }, { "epoch": 0.38413702496228896, "grad_norm": 1.6129400270413075, "learning_rate": 2.785210648243707e-05, "loss": 0.2462, "step": 4934 }, { "epoch": 0.38421488005449855, "grad_norm": 1.5444742167591163, "learning_rate": 2.784751523939644e-05, "loss": 0.2301, "step": 4935 }, { "epoch": 0.3842927351467082, "grad_norm": 1.4535078856882506, "learning_rate": 2.7842923507492496e-05, "loss": 0.2265, "step": 4936 }, { "epoch": 0.38437059023891784, "grad_norm": 1.4409502310351483, "learning_rate": 2.7838331287011285e-05, "loss": 0.2151, "step": 4937 }, { "epoch": 0.38444844533112743, "grad_norm": 1.4763280282003046, "learning_rate": 2.7833738578238878e-05, "loss": 0.2268, "step": 4938 }, { "epoch": 0.3845263004233371, "grad_norm": 1.436311429328908, "learning_rate": 2.7829145381461377e-05, "loss": 0.2175, "step": 4939 }, { "epoch": 0.38460415551554666, "grad_norm": 1.4741518589197544, "learning_rate": 2.782455169696492e-05, "loss": 0.249, "step": 4940 }, { "epoch": 0.3846820106077563, "grad_norm": 1.4239829521066547, "learning_rate": 2.781995752503567e-05, "loss": 0.202, "step": 4941 }, { "epoch": 0.38475986569996595, "grad_norm": 1.4285453305302416, "learning_rate": 2.7815362865959828e-05, "loss": 0.2099, "step": 4942 }, { "epoch": 0.38483772079217554, "grad_norm": 1.398687761030148, "learning_rate": 2.7810767720023607e-05, "loss": 0.2057, "step": 4943 }, { "epoch": 0.3849155758843852, "grad_norm": 1.4224720575025145, "learning_rate": 2.7806172087513274e-05, "loss": 0.2057, "step": 4944 }, { "epoch": 0.38499343097659483, "grad_norm": 1.4429655189249249, "learning_rate": 2.7801575968715115e-05, "loss": 0.2144, "step": 4945 }, { "epoch": 0.3850712860688044, "grad_norm": 1.332618548835128, "learning_rate": 2.7796979363915438e-05, "loss": 0.2229, "step": 4946 }, { "epoch": 0.38514914116101406, "grad_norm": 1.3664458226737415, "learning_rate": 2.7792382273400608e-05, "loss": 0.213, "step": 4947 }, { "epoch": 0.3852269962532237, "grad_norm": 1.3564552181091603, "learning_rate": 2.7787784697456976e-05, "loss": 0.205, "step": 4948 }, { "epoch": 0.3853048513454333, "grad_norm": 1.3637830039002516, "learning_rate": 2.778318663637097e-05, "loss": 0.2058, "step": 4949 }, { "epoch": 0.38538270643764294, "grad_norm": 1.2815973372546183, "learning_rate": 2.777858809042902e-05, "loss": 0.1923, "step": 4950 }, { "epoch": 0.38538270643764294, "eval_loss": 0.02711302787065506, "eval_runtime": 162.6795, "eval_samples_per_second": 17.704, "eval_steps_per_second": 0.633, "step": 4950 }, { "epoch": 0.3854605615298526, "grad_norm": 1.3920361706822875, "learning_rate": 2.7773989059917595e-05, "loss": 0.2073, "step": 4951 }, { "epoch": 0.3855384166220622, "grad_norm": 1.292414956766584, "learning_rate": 2.776938954512319e-05, "loss": 0.2105, "step": 4952 }, { "epoch": 0.3856162717142718, "grad_norm": 1.4163720929043353, "learning_rate": 2.7764789546332336e-05, "loss": 0.2171, "step": 4953 }, { "epoch": 0.38569412680648146, "grad_norm": 1.3422650454415228, "learning_rate": 2.7760189063831592e-05, "loss": 0.2074, "step": 4954 }, { "epoch": 0.38577198189869105, "grad_norm": 1.5224247527469121, "learning_rate": 2.775558809790755e-05, "loss": 0.2233, "step": 4955 }, { "epoch": 0.3858498369909007, "grad_norm": 1.5273595692481403, "learning_rate": 2.7750986648846815e-05, "loss": 0.2172, "step": 4956 }, { "epoch": 0.3859276920831103, "grad_norm": 1.3652655014519735, "learning_rate": 2.774638471693605e-05, "loss": 0.1755, "step": 4957 }, { "epoch": 0.38600554717531993, "grad_norm": 1.3168116662851566, "learning_rate": 2.7741782302461927e-05, "loss": 0.1884, "step": 4958 }, { "epoch": 0.3860834022675296, "grad_norm": 1.449443157097701, "learning_rate": 2.773717940571116e-05, "loss": 0.2431, "step": 4959 }, { "epoch": 0.38616125735973916, "grad_norm": 1.3625679891451128, "learning_rate": 2.7732576026970478e-05, "loss": 0.2086, "step": 4960 }, { "epoch": 0.3862391124519488, "grad_norm": 1.4055504311719957, "learning_rate": 2.772797216652666e-05, "loss": 0.1889, "step": 4961 }, { "epoch": 0.38631696754415845, "grad_norm": 1.4311730876532993, "learning_rate": 2.77233678246665e-05, "loss": 0.2365, "step": 4962 }, { "epoch": 0.38639482263636804, "grad_norm": 1.3652375885510304, "learning_rate": 2.771876300167683e-05, "loss": 0.2043, "step": 4963 }, { "epoch": 0.3864726777285777, "grad_norm": 1.3867177238704422, "learning_rate": 2.7714157697844494e-05, "loss": 0.2037, "step": 4964 }, { "epoch": 0.38655053282078733, "grad_norm": 1.5156687143310221, "learning_rate": 2.7709551913456406e-05, "loss": 0.1942, "step": 4965 }, { "epoch": 0.3866283879129969, "grad_norm": 1.564747480701733, "learning_rate": 2.7704945648799468e-05, "loss": 0.2096, "step": 4966 }, { "epoch": 0.38670624300520656, "grad_norm": 1.495943782070293, "learning_rate": 2.7700338904160634e-05, "loss": 0.2462, "step": 4967 }, { "epoch": 0.3867840980974162, "grad_norm": 1.3720781729423939, "learning_rate": 2.769573167982688e-05, "loss": 0.2064, "step": 4968 }, { "epoch": 0.3868619531896258, "grad_norm": 1.5222719342981337, "learning_rate": 2.7691123976085213e-05, "loss": 0.2366, "step": 4969 }, { "epoch": 0.38693980828183544, "grad_norm": 1.5277003493402572, "learning_rate": 2.7686515793222673e-05, "loss": 0.2305, "step": 4970 }, { "epoch": 0.3870176633740451, "grad_norm": 1.2834867218476254, "learning_rate": 2.7681907131526337e-05, "loss": 0.177, "step": 4971 }, { "epoch": 0.3870955184662547, "grad_norm": 1.4381314976220443, "learning_rate": 2.767729799128329e-05, "loss": 0.1997, "step": 4972 }, { "epoch": 0.3871733735584643, "grad_norm": 1.3220633282956926, "learning_rate": 2.7672688372780664e-05, "loss": 0.1863, "step": 4973 }, { "epoch": 0.3872512286506739, "grad_norm": 1.382133021402589, "learning_rate": 2.7668078276305617e-05, "loss": 0.2045, "step": 4974 }, { "epoch": 0.38732908374288355, "grad_norm": 1.5417656249778158, "learning_rate": 2.7663467702145343e-05, "loss": 0.2397, "step": 4975 }, { "epoch": 0.3874069388350932, "grad_norm": 1.4093831093076186, "learning_rate": 2.7658856650587043e-05, "loss": 0.2278, "step": 4976 }, { "epoch": 0.3874847939273028, "grad_norm": 1.394156476832959, "learning_rate": 2.7654245121917975e-05, "loss": 0.2111, "step": 4977 }, { "epoch": 0.38756264901951243, "grad_norm": 1.3872331148532282, "learning_rate": 2.7649633116425414e-05, "loss": 0.2097, "step": 4978 }, { "epoch": 0.3876405041117221, "grad_norm": 1.3446499773000848, "learning_rate": 2.7645020634396668e-05, "loss": 0.1833, "step": 4979 }, { "epoch": 0.38771835920393166, "grad_norm": 1.3874492164431151, "learning_rate": 2.7640407676119076e-05, "loss": 0.2153, "step": 4980 }, { "epoch": 0.3877962142961413, "grad_norm": 1.4002562044254339, "learning_rate": 2.7635794241879993e-05, "loss": 0.1907, "step": 4981 }, { "epoch": 0.38787406938835095, "grad_norm": 1.4263004865645055, "learning_rate": 2.7631180331966824e-05, "loss": 0.238, "step": 4982 }, { "epoch": 0.38795192448056054, "grad_norm": 1.3573059192581671, "learning_rate": 2.7626565946666985e-05, "loss": 0.2054, "step": 4983 }, { "epoch": 0.3880297795727702, "grad_norm": 1.3273523098821427, "learning_rate": 2.762195108626794e-05, "loss": 0.2035, "step": 4984 }, { "epoch": 0.38810763466497983, "grad_norm": 1.491914771315591, "learning_rate": 2.7617335751057172e-05, "loss": 0.2172, "step": 4985 }, { "epoch": 0.3881854897571894, "grad_norm": 1.4857923776554964, "learning_rate": 2.7612719941322184e-05, "loss": 0.2016, "step": 4986 }, { "epoch": 0.38826334484939906, "grad_norm": 1.449519054806515, "learning_rate": 2.7608103657350526e-05, "loss": 0.219, "step": 4987 }, { "epoch": 0.3883411999416087, "grad_norm": 1.3801815998091673, "learning_rate": 2.760348689942977e-05, "loss": 0.1987, "step": 4988 }, { "epoch": 0.3884190550338183, "grad_norm": 1.4313415446759319, "learning_rate": 2.7598869667847528e-05, "loss": 0.2047, "step": 4989 }, { "epoch": 0.38849691012602794, "grad_norm": 1.4449558692941076, "learning_rate": 2.7594251962891422e-05, "loss": 0.226, "step": 4990 }, { "epoch": 0.38857476521823753, "grad_norm": 1.4053509522770864, "learning_rate": 2.7589633784849112e-05, "loss": 0.1964, "step": 4991 }, { "epoch": 0.38865262031044717, "grad_norm": 1.3554829854321382, "learning_rate": 2.7585015134008292e-05, "loss": 0.1857, "step": 4992 }, { "epoch": 0.3887304754026568, "grad_norm": 1.57081713197367, "learning_rate": 2.7580396010656684e-05, "loss": 0.2473, "step": 4993 }, { "epoch": 0.3888083304948664, "grad_norm": 1.3046898280145351, "learning_rate": 2.757577641508203e-05, "loss": 0.1899, "step": 4994 }, { "epoch": 0.38888618558707605, "grad_norm": 1.4911933347628925, "learning_rate": 2.757115634757212e-05, "loss": 0.2299, "step": 4995 }, { "epoch": 0.3889640406792857, "grad_norm": 1.4441301590030233, "learning_rate": 2.7566535808414757e-05, "loss": 0.2166, "step": 4996 }, { "epoch": 0.3890418957714953, "grad_norm": 1.5154098772636957, "learning_rate": 2.756191479789777e-05, "loss": 0.1959, "step": 4997 }, { "epoch": 0.3891197508637049, "grad_norm": 1.3961139392167656, "learning_rate": 2.7557293316309047e-05, "loss": 0.2219, "step": 4998 }, { "epoch": 0.38919760595591457, "grad_norm": 1.3690813749914583, "learning_rate": 2.7552671363936466e-05, "loss": 0.1985, "step": 4999 }, { "epoch": 0.38927546104812416, "grad_norm": 1.27647283437879, "learning_rate": 2.7548048941067962e-05, "loss": 0.2015, "step": 5000 }, { "epoch": 0.38927546104812416, "eval_loss": 0.026405027136206627, "eval_runtime": 163.1234, "eval_samples_per_second": 17.655, "eval_steps_per_second": 0.631, "step": 5000 }, { "epoch": 0.3893533161403338, "grad_norm": 1.5405165402210148, "learning_rate": 2.754342604799149e-05, "loss": 0.198, "step": 5001 }, { "epoch": 0.38943117123254345, "grad_norm": 1.356503904478101, "learning_rate": 2.753880268499503e-05, "loss": 0.1937, "step": 5002 }, { "epoch": 0.38950902632475304, "grad_norm": 1.3341629221403137, "learning_rate": 2.75341788523666e-05, "loss": 0.1968, "step": 5003 }, { "epoch": 0.3895868814169627, "grad_norm": 1.3980755551546886, "learning_rate": 2.7529554550394242e-05, "loss": 0.1807, "step": 5004 }, { "epoch": 0.3896647365091723, "grad_norm": 1.2995058187826343, "learning_rate": 2.7524929779366023e-05, "loss": 0.1601, "step": 5005 }, { "epoch": 0.3897425916013819, "grad_norm": 1.3805024331941431, "learning_rate": 2.752030453957005e-05, "loss": 0.2003, "step": 5006 }, { "epoch": 0.38982044669359156, "grad_norm": 1.4169932851724285, "learning_rate": 2.7515678831294453e-05, "loss": 0.1768, "step": 5007 }, { "epoch": 0.3898983017858012, "grad_norm": 1.4500254943933597, "learning_rate": 2.7511052654827396e-05, "loss": 0.2002, "step": 5008 }, { "epoch": 0.3899761568780108, "grad_norm": 1.3988631888832739, "learning_rate": 2.7506426010457065e-05, "loss": 0.2324, "step": 5009 }, { "epoch": 0.39005401197022044, "grad_norm": 1.3513855157269845, "learning_rate": 2.750179889847167e-05, "loss": 0.192, "step": 5010 }, { "epoch": 0.39013186706243, "grad_norm": 1.2982354030632077, "learning_rate": 2.749717131915947e-05, "loss": 0.1806, "step": 5011 }, { "epoch": 0.39020972215463967, "grad_norm": 1.3823015525673905, "learning_rate": 2.749254327280873e-05, "loss": 0.1964, "step": 5012 }, { "epoch": 0.3902875772468493, "grad_norm": 1.2847590134069446, "learning_rate": 2.748791475970777e-05, "loss": 0.1638, "step": 5013 }, { "epoch": 0.3903654323390589, "grad_norm": 1.5955967968770566, "learning_rate": 2.7483285780144914e-05, "loss": 0.2528, "step": 5014 }, { "epoch": 0.39044328743126855, "grad_norm": 1.4944359548262243, "learning_rate": 2.7478656334408524e-05, "loss": 0.2206, "step": 5015 }, { "epoch": 0.3905211425234782, "grad_norm": 1.298828965244944, "learning_rate": 2.7474026422786998e-05, "loss": 0.1947, "step": 5016 }, { "epoch": 0.3905989976156878, "grad_norm": 1.4842244009985819, "learning_rate": 2.746939604556876e-05, "loss": 0.2463, "step": 5017 }, { "epoch": 0.3906768527078974, "grad_norm": 1.4192753371741584, "learning_rate": 2.746476520304226e-05, "loss": 0.2148, "step": 5018 }, { "epoch": 0.39075470780010707, "grad_norm": 1.376221774942106, "learning_rate": 2.7460133895495976e-05, "loss": 0.2164, "step": 5019 }, { "epoch": 0.39083256289231666, "grad_norm": 1.3696708746793325, "learning_rate": 2.745550212321841e-05, "loss": 0.2164, "step": 5020 }, { "epoch": 0.3909104179845263, "grad_norm": 1.3812401245266508, "learning_rate": 2.745086988649811e-05, "loss": 0.2258, "step": 5021 }, { "epoch": 0.39098827307673595, "grad_norm": 1.3262416642522952, "learning_rate": 2.7446237185623634e-05, "loss": 0.2004, "step": 5022 }, { "epoch": 0.39106612816894554, "grad_norm": 1.5233794110817875, "learning_rate": 2.7441604020883583e-05, "loss": 0.2059, "step": 5023 }, { "epoch": 0.3911439832611552, "grad_norm": 1.424479453561015, "learning_rate": 2.7436970392566587e-05, "loss": 0.2051, "step": 5024 }, { "epoch": 0.3912218383533648, "grad_norm": 1.3869667702727235, "learning_rate": 2.7432336300961286e-05, "loss": 0.1958, "step": 5025 }, { "epoch": 0.3912996934455744, "grad_norm": 1.3493404624969323, "learning_rate": 2.7427701746356364e-05, "loss": 0.1916, "step": 5026 }, { "epoch": 0.39137754853778406, "grad_norm": 1.3229088709047372, "learning_rate": 2.7423066729040545e-05, "loss": 0.1721, "step": 5027 }, { "epoch": 0.39145540362999365, "grad_norm": 1.373251421613485, "learning_rate": 2.741843124930256e-05, "loss": 0.1934, "step": 5028 }, { "epoch": 0.3915332587222033, "grad_norm": 1.451985943410074, "learning_rate": 2.7413795307431175e-05, "loss": 0.2147, "step": 5029 }, { "epoch": 0.39161111381441294, "grad_norm": 1.4331177777122135, "learning_rate": 2.7409158903715183e-05, "loss": 0.2143, "step": 5030 }, { "epoch": 0.3916889689066225, "grad_norm": 1.4167611045562387, "learning_rate": 2.740452203844343e-05, "loss": 0.2116, "step": 5031 }, { "epoch": 0.39176682399883217, "grad_norm": 1.372161972223887, "learning_rate": 2.7399884711904752e-05, "loss": 0.1859, "step": 5032 }, { "epoch": 0.3918446790910418, "grad_norm": 1.4272138615005168, "learning_rate": 2.7395246924388042e-05, "loss": 0.2256, "step": 5033 }, { "epoch": 0.3919225341832514, "grad_norm": 1.31068633527325, "learning_rate": 2.7390608676182204e-05, "loss": 0.197, "step": 5034 }, { "epoch": 0.39200038927546105, "grad_norm": 1.3609214630029678, "learning_rate": 2.7385969967576188e-05, "loss": 0.2183, "step": 5035 }, { "epoch": 0.3920782443676707, "grad_norm": 1.3612783503946624, "learning_rate": 2.738133079885896e-05, "loss": 0.2063, "step": 5036 }, { "epoch": 0.3921560994598803, "grad_norm": 1.4047309785061268, "learning_rate": 2.737669117031952e-05, "loss": 0.2249, "step": 5037 }, { "epoch": 0.3922339545520899, "grad_norm": 1.4173147597112152, "learning_rate": 2.7372051082246893e-05, "loss": 0.2102, "step": 5038 }, { "epoch": 0.39231180964429957, "grad_norm": 1.3704905061725392, "learning_rate": 2.736741053493013e-05, "loss": 0.1788, "step": 5039 }, { "epoch": 0.39238966473650916, "grad_norm": 1.3937964828604366, "learning_rate": 2.7362769528658328e-05, "loss": 0.2019, "step": 5040 }, { "epoch": 0.3924675198287188, "grad_norm": 1.3725822299610406, "learning_rate": 2.735812806372059e-05, "loss": 0.208, "step": 5041 }, { "epoch": 0.39254537492092845, "grad_norm": 1.3623892193941427, "learning_rate": 2.7353486140406062e-05, "loss": 0.1865, "step": 5042 }, { "epoch": 0.39262323001313804, "grad_norm": 1.31881763219477, "learning_rate": 2.7348843759003905e-05, "loss": 0.1997, "step": 5043 }, { "epoch": 0.3927010851053477, "grad_norm": 1.312503851352756, "learning_rate": 2.734420091980333e-05, "loss": 0.2044, "step": 5044 }, { "epoch": 0.39277894019755727, "grad_norm": 1.4720433525635188, "learning_rate": 2.7339557623093557e-05, "loss": 0.1926, "step": 5045 }, { "epoch": 0.3928567952897669, "grad_norm": 1.3953815826615976, "learning_rate": 2.7334913869163838e-05, "loss": 0.2001, "step": 5046 }, { "epoch": 0.39293465038197656, "grad_norm": 1.4500947559931114, "learning_rate": 2.7330269658303473e-05, "loss": 0.1878, "step": 5047 }, { "epoch": 0.39301250547418615, "grad_norm": 1.3066150380377934, "learning_rate": 2.7325624990801754e-05, "loss": 0.1596, "step": 5048 }, { "epoch": 0.3930903605663958, "grad_norm": 1.3395904860897712, "learning_rate": 2.7320979866948038e-05, "loss": 0.2031, "step": 5049 }, { "epoch": 0.39316821565860544, "grad_norm": 1.5095663335337477, "learning_rate": 2.7316334287031688e-05, "loss": 0.1947, "step": 5050 }, { "epoch": 0.39316821565860544, "eval_loss": 0.02575027570128441, "eval_runtime": 162.6371, "eval_samples_per_second": 17.708, "eval_steps_per_second": 0.633, "step": 5050 }, { "epoch": 0.393246070750815, "grad_norm": 1.444947758477605, "learning_rate": 2.7311688251342098e-05, "loss": 0.2187, "step": 5051 }, { "epoch": 0.39332392584302467, "grad_norm": 1.417566460257311, "learning_rate": 2.73070417601687e-05, "loss": 0.1919, "step": 5052 }, { "epoch": 0.3934017809352343, "grad_norm": 1.3941656367516764, "learning_rate": 2.7302394813800943e-05, "loss": 0.2083, "step": 5053 }, { "epoch": 0.3934796360274439, "grad_norm": 1.3908155373368174, "learning_rate": 2.7297747412528318e-05, "loss": 0.2191, "step": 5054 }, { "epoch": 0.39355749111965355, "grad_norm": 1.3954158892647308, "learning_rate": 2.7293099556640327e-05, "loss": 0.2131, "step": 5055 }, { "epoch": 0.3936353462118632, "grad_norm": 1.4992688938196952, "learning_rate": 2.7288451246426517e-05, "loss": 0.1985, "step": 5056 }, { "epoch": 0.3937132013040728, "grad_norm": 1.5167219975395936, "learning_rate": 2.7283802482176453e-05, "loss": 0.2151, "step": 5057 }, { "epoch": 0.3937910563962824, "grad_norm": 1.428245859071685, "learning_rate": 2.7279153264179726e-05, "loss": 0.2061, "step": 5058 }, { "epoch": 0.39386891148849207, "grad_norm": 1.434888250150821, "learning_rate": 2.7274503592725975e-05, "loss": 0.2063, "step": 5059 }, { "epoch": 0.39394676658070166, "grad_norm": 1.5371360055520544, "learning_rate": 2.7269853468104836e-05, "loss": 0.1986, "step": 5060 }, { "epoch": 0.3940246216729113, "grad_norm": 1.6270194887202833, "learning_rate": 2.7265202890606002e-05, "loss": 0.2383, "step": 5061 }, { "epoch": 0.39410247676512095, "grad_norm": 1.3594474960438925, "learning_rate": 2.726055186051918e-05, "loss": 0.212, "step": 5062 }, { "epoch": 0.39418033185733053, "grad_norm": 1.5580562666931292, "learning_rate": 2.72559003781341e-05, "loss": 0.2357, "step": 5063 }, { "epoch": 0.3942581869495402, "grad_norm": 1.4778518824347213, "learning_rate": 2.725124844374053e-05, "loss": 0.2056, "step": 5064 }, { "epoch": 0.39433604204174977, "grad_norm": 1.3770797611010117, "learning_rate": 2.7246596057628266e-05, "loss": 0.171, "step": 5065 }, { "epoch": 0.3944138971339594, "grad_norm": 1.22722459301907, "learning_rate": 2.7241943220087134e-05, "loss": 0.1568, "step": 5066 }, { "epoch": 0.39449175222616906, "grad_norm": 1.2834569205259483, "learning_rate": 2.7237289931406977e-05, "loss": 0.1703, "step": 5067 }, { "epoch": 0.39456960731837865, "grad_norm": 1.4410962519202288, "learning_rate": 2.7232636191877678e-05, "loss": 0.1859, "step": 5068 }, { "epoch": 0.3946474624105883, "grad_norm": 1.4643675007882333, "learning_rate": 2.722798200178914e-05, "loss": 0.2286, "step": 5069 }, { "epoch": 0.39472531750279793, "grad_norm": 1.5601865737927494, "learning_rate": 2.7223327361431294e-05, "loss": 0.2106, "step": 5070 }, { "epoch": 0.3948031725950075, "grad_norm": 1.3984422997282588, "learning_rate": 2.7218672271094112e-05, "loss": 0.2002, "step": 5071 }, { "epoch": 0.39488102768721717, "grad_norm": 1.403823908560182, "learning_rate": 2.7214016731067575e-05, "loss": 0.1968, "step": 5072 }, { "epoch": 0.3949588827794268, "grad_norm": 1.562324195243509, "learning_rate": 2.72093607416417e-05, "loss": 0.259, "step": 5073 }, { "epoch": 0.3950367378716364, "grad_norm": 1.4943845427036262, "learning_rate": 2.720470430310654e-05, "loss": 0.2349, "step": 5074 }, { "epoch": 0.39511459296384605, "grad_norm": 1.4963075858482084, "learning_rate": 2.720004741575217e-05, "loss": 0.212, "step": 5075 }, { "epoch": 0.3951924480560557, "grad_norm": 1.3838820857524716, "learning_rate": 2.7195390079868688e-05, "loss": 0.2267, "step": 5076 }, { "epoch": 0.3952703031482653, "grad_norm": 1.3118407494271673, "learning_rate": 2.719073229574623e-05, "loss": 0.1824, "step": 5077 }, { "epoch": 0.3953481582404749, "grad_norm": 1.2739766249905289, "learning_rate": 2.7186074063674946e-05, "loss": 0.1853, "step": 5078 }, { "epoch": 0.39542601333268457, "grad_norm": 1.4312060370345303, "learning_rate": 2.7181415383945017e-05, "loss": 0.2463, "step": 5079 }, { "epoch": 0.39550386842489416, "grad_norm": 1.4987826602296812, "learning_rate": 2.717675625684668e-05, "loss": 0.229, "step": 5080 }, { "epoch": 0.3955817235171038, "grad_norm": 1.3850119503798226, "learning_rate": 2.7172096682670157e-05, "loss": 0.1711, "step": 5081 }, { "epoch": 0.3956595786093134, "grad_norm": 1.3634150764575712, "learning_rate": 2.7167436661705715e-05, "loss": 0.2534, "step": 5082 }, { "epoch": 0.39573743370152303, "grad_norm": 1.429062754331154, "learning_rate": 2.7162776194243667e-05, "loss": 0.235, "step": 5083 }, { "epoch": 0.3958152887937327, "grad_norm": 1.3561428508017834, "learning_rate": 2.7158115280574322e-05, "loss": 0.1775, "step": 5084 }, { "epoch": 0.39589314388594227, "grad_norm": 1.3647975918819253, "learning_rate": 2.7153453920988053e-05, "loss": 0.1872, "step": 5085 }, { "epoch": 0.3959709989781519, "grad_norm": 1.4376155389228529, "learning_rate": 2.7148792115775223e-05, "loss": 0.1842, "step": 5086 }, { "epoch": 0.39604885407036156, "grad_norm": 1.4426177398839592, "learning_rate": 2.7144129865226242e-05, "loss": 0.2117, "step": 5087 }, { "epoch": 0.39612670916257114, "grad_norm": 1.3792431705254369, "learning_rate": 2.713946716963155e-05, "loss": 0.1995, "step": 5088 }, { "epoch": 0.3962045642547808, "grad_norm": 1.4161375557065492, "learning_rate": 2.7134804029281618e-05, "loss": 0.2196, "step": 5089 }, { "epoch": 0.39628241934699043, "grad_norm": 1.3908469827449668, "learning_rate": 2.713014044446693e-05, "loss": 0.1854, "step": 5090 }, { "epoch": 0.3963602744392, "grad_norm": 1.4256074474806708, "learning_rate": 2.7125476415478013e-05, "loss": 0.1874, "step": 5091 }, { "epoch": 0.39643812953140967, "grad_norm": 1.3065679168580668, "learning_rate": 2.71208119426054e-05, "loss": 0.229, "step": 5092 }, { "epoch": 0.3965159846236193, "grad_norm": 1.362092546834988, "learning_rate": 2.7116147026139672e-05, "loss": 0.1919, "step": 5093 }, { "epoch": 0.3965938397158289, "grad_norm": 1.3808045383849579, "learning_rate": 2.711148166637144e-05, "loss": 0.1901, "step": 5094 }, { "epoch": 0.39667169480803854, "grad_norm": 1.2978948972200812, "learning_rate": 2.7106815863591324e-05, "loss": 0.1822, "step": 5095 }, { "epoch": 0.3967495499002482, "grad_norm": 1.2514630534927276, "learning_rate": 2.710214961808998e-05, "loss": 0.1951, "step": 5096 }, { "epoch": 0.3968274049924578, "grad_norm": 1.4610131433825162, "learning_rate": 2.7097482930158105e-05, "loss": 0.2011, "step": 5097 }, { "epoch": 0.3969052600846674, "grad_norm": 1.3774856994424307, "learning_rate": 2.7092815800086397e-05, "loss": 0.2209, "step": 5098 }, { "epoch": 0.396983115176877, "grad_norm": 1.3946807104236174, "learning_rate": 2.708814822816561e-05, "loss": 0.1823, "step": 5099 }, { "epoch": 0.39706097026908665, "grad_norm": 1.3461003284970037, "learning_rate": 2.7083480214686505e-05, "loss": 0.2147, "step": 5100 }, { "epoch": 0.39706097026908665, "eval_loss": 0.025263095274567604, "eval_runtime": 167.3777, "eval_samples_per_second": 17.207, "eval_steps_per_second": 0.615, "step": 5100 }, { "epoch": 0.3971388253612963, "grad_norm": 1.4809861724593367, "learning_rate": 2.707881175993988e-05, "loss": 0.1874, "step": 5101 }, { "epoch": 0.3972166804535059, "grad_norm": 1.3617351297514755, "learning_rate": 2.7074142864216546e-05, "loss": 0.1648, "step": 5102 }, { "epoch": 0.39729453554571553, "grad_norm": 1.3555087778312556, "learning_rate": 2.7069473527807373e-05, "loss": 0.1814, "step": 5103 }, { "epoch": 0.3973723906379252, "grad_norm": 1.314533772151398, "learning_rate": 2.706480375100323e-05, "loss": 0.1894, "step": 5104 }, { "epoch": 0.39745024573013477, "grad_norm": 1.451401711518249, "learning_rate": 2.7060133534095014e-05, "loss": 0.2184, "step": 5105 }, { "epoch": 0.3975281008223444, "grad_norm": 1.5644533554336841, "learning_rate": 2.7055462877373666e-05, "loss": 0.2111, "step": 5106 }, { "epoch": 0.39760595591455405, "grad_norm": 1.4229304786648223, "learning_rate": 2.7050791781130142e-05, "loss": 0.2117, "step": 5107 }, { "epoch": 0.39768381100676364, "grad_norm": 1.3531819164936145, "learning_rate": 2.7046120245655435e-05, "loss": 0.1994, "step": 5108 }, { "epoch": 0.3977616660989733, "grad_norm": 1.2626726254452891, "learning_rate": 2.704144827124056e-05, "loss": 0.1919, "step": 5109 }, { "epoch": 0.39783952119118293, "grad_norm": 1.3928629915589796, "learning_rate": 2.703677585817655e-05, "loss": 0.194, "step": 5110 }, { "epoch": 0.3979173762833925, "grad_norm": 1.3668133135610876, "learning_rate": 2.703210300675448e-05, "loss": 0.1859, "step": 5111 }, { "epoch": 0.39799523137560217, "grad_norm": 1.3796928461087694, "learning_rate": 2.7027429717265444e-05, "loss": 0.1929, "step": 5112 }, { "epoch": 0.3980730864678118, "grad_norm": 1.3065445416994321, "learning_rate": 2.702275599000058e-05, "loss": 0.2017, "step": 5113 }, { "epoch": 0.3981509415600214, "grad_norm": 1.6731881487349787, "learning_rate": 2.701808182525102e-05, "loss": 0.2265, "step": 5114 }, { "epoch": 0.39822879665223104, "grad_norm": 1.3693280980700937, "learning_rate": 2.7013407223307945e-05, "loss": 0.1938, "step": 5115 }, { "epoch": 0.3983066517444407, "grad_norm": 1.3648498048169015, "learning_rate": 2.7008732184462575e-05, "loss": 0.1994, "step": 5116 }, { "epoch": 0.3983845068366503, "grad_norm": 1.3380557501973183, "learning_rate": 2.700405670900613e-05, "loss": 0.1664, "step": 5117 }, { "epoch": 0.3984623619288599, "grad_norm": 1.3806457948428357, "learning_rate": 2.6999380797229876e-05, "loss": 0.2428, "step": 5118 }, { "epoch": 0.3985402170210695, "grad_norm": 1.5164180352212517, "learning_rate": 2.6994704449425096e-05, "loss": 0.2256, "step": 5119 }, { "epoch": 0.39861807211327915, "grad_norm": 1.4658937086362074, "learning_rate": 2.6990027665883105e-05, "loss": 0.2142, "step": 5120 }, { "epoch": 0.3986959272054888, "grad_norm": 1.4915019330887642, "learning_rate": 2.6985350446895248e-05, "loss": 0.238, "step": 5121 }, { "epoch": 0.3987737822976984, "grad_norm": 1.4854262189621623, "learning_rate": 2.6980672792752895e-05, "loss": 0.2159, "step": 5122 }, { "epoch": 0.39885163738990803, "grad_norm": 1.3612480969519953, "learning_rate": 2.6975994703747435e-05, "loss": 0.1923, "step": 5123 }, { "epoch": 0.3989294924821177, "grad_norm": 1.3530429611560817, "learning_rate": 2.69713161801703e-05, "loss": 0.1831, "step": 5124 }, { "epoch": 0.39900734757432726, "grad_norm": 1.4333895887568315, "learning_rate": 2.696663722231293e-05, "loss": 0.2022, "step": 5125 }, { "epoch": 0.3990852026665369, "grad_norm": 1.3885725605618329, "learning_rate": 2.6961957830466805e-05, "loss": 0.1949, "step": 5126 }, { "epoch": 0.39916305775874655, "grad_norm": 1.4521077795965087, "learning_rate": 2.695727800492344e-05, "loss": 0.2069, "step": 5127 }, { "epoch": 0.39924091285095614, "grad_norm": 1.4275891071071074, "learning_rate": 2.6952597745974353e-05, "loss": 0.2078, "step": 5128 }, { "epoch": 0.3993187679431658, "grad_norm": 1.541926172911954, "learning_rate": 2.6947917053911106e-05, "loss": 0.211, "step": 5129 }, { "epoch": 0.39939662303537543, "grad_norm": 1.4401206119065968, "learning_rate": 2.694323592902528e-05, "loss": 0.2094, "step": 5130 }, { "epoch": 0.399474478127585, "grad_norm": 1.417460746892604, "learning_rate": 2.69385543716085e-05, "loss": 0.2043, "step": 5131 }, { "epoch": 0.39955233321979466, "grad_norm": 1.4032440666741497, "learning_rate": 2.6933872381952394e-05, "loss": 0.2019, "step": 5132 }, { "epoch": 0.3996301883120043, "grad_norm": 1.4123303079458316, "learning_rate": 2.6929189960348628e-05, "loss": 0.1878, "step": 5133 }, { "epoch": 0.3997080434042139, "grad_norm": 1.3218570270280663, "learning_rate": 2.69245071070889e-05, "loss": 0.199, "step": 5134 }, { "epoch": 0.39978589849642354, "grad_norm": 1.3415525929319143, "learning_rate": 2.6919823822464925e-05, "loss": 0.1856, "step": 5135 }, { "epoch": 0.39986375358863313, "grad_norm": 1.4433376900355974, "learning_rate": 2.691514010676845e-05, "loss": 0.2036, "step": 5136 }, { "epoch": 0.3999416086808428, "grad_norm": 1.4472833620737267, "learning_rate": 2.6910455960291257e-05, "loss": 0.2037, "step": 5137 }, { "epoch": 0.4000194637730524, "grad_norm": 1.3559430088881532, "learning_rate": 2.690577138332514e-05, "loss": 0.1818, "step": 5138 }, { "epoch": 0.400097318865262, "grad_norm": 1.3204729984238956, "learning_rate": 2.6901086376161925e-05, "loss": 0.1847, "step": 5139 }, { "epoch": 0.40017517395747165, "grad_norm": 1.2662334499955814, "learning_rate": 2.689640093909346e-05, "loss": 0.1817, "step": 5140 }, { "epoch": 0.4002530290496813, "grad_norm": 1.3061577078996416, "learning_rate": 2.6891715072411633e-05, "loss": 0.185, "step": 5141 }, { "epoch": 0.4003308841418909, "grad_norm": 1.5612247586798351, "learning_rate": 2.6887028776408365e-05, "loss": 0.2431, "step": 5142 }, { "epoch": 0.40040873923410053, "grad_norm": 1.3798550520654342, "learning_rate": 2.688234205137556e-05, "loss": 0.2193, "step": 5143 }, { "epoch": 0.4004865943263102, "grad_norm": 1.2728568715805335, "learning_rate": 2.68776548976052e-05, "loss": 0.1987, "step": 5144 }, { "epoch": 0.40056444941851976, "grad_norm": 1.3356653786846544, "learning_rate": 2.6872967315389274e-05, "loss": 0.2061, "step": 5145 }, { "epoch": 0.4006423045107294, "grad_norm": 1.4279670307510954, "learning_rate": 2.686827930501978e-05, "loss": 0.2588, "step": 5146 }, { "epoch": 0.40072015960293905, "grad_norm": 1.3392369903869576, "learning_rate": 2.6863590866788775e-05, "loss": 0.2228, "step": 5147 }, { "epoch": 0.40079801469514864, "grad_norm": 1.3694389379520786, "learning_rate": 2.6858902000988315e-05, "loss": 0.1778, "step": 5148 }, { "epoch": 0.4008758697873583, "grad_norm": 1.3588196752371318, "learning_rate": 2.685421270791051e-05, "loss": 0.1837, "step": 5149 }, { "epoch": 0.40095372487956793, "grad_norm": 1.2424727898530563, "learning_rate": 2.6849522987847466e-05, "loss": 0.1512, "step": 5150 }, { "epoch": 0.40095372487956793, "eval_loss": 0.02471284754574299, "eval_runtime": 162.9007, "eval_samples_per_second": 17.679, "eval_steps_per_second": 0.632, "step": 5150 }, { "epoch": 0.4010315799717775, "grad_norm": 1.4272063367228978, "learning_rate": 2.6844832841091335e-05, "loss": 0.1879, "step": 5151 }, { "epoch": 0.40110943506398716, "grad_norm": 1.2697080620267125, "learning_rate": 2.684014226793429e-05, "loss": 0.2095, "step": 5152 }, { "epoch": 0.40118729015619675, "grad_norm": 1.3961461643099553, "learning_rate": 2.6835451268668534e-05, "loss": 0.187, "step": 5153 }, { "epoch": 0.4012651452484064, "grad_norm": 1.334474271452915, "learning_rate": 2.683075984358629e-05, "loss": 0.1734, "step": 5154 }, { "epoch": 0.40134300034061604, "grad_norm": 1.3720185353008945, "learning_rate": 2.6826067992979824e-05, "loss": 0.202, "step": 5155 }, { "epoch": 0.40142085543282563, "grad_norm": 1.3364968414212202, "learning_rate": 2.6821375717141402e-05, "loss": 0.2214, "step": 5156 }, { "epoch": 0.4014987105250353, "grad_norm": 1.363371486251566, "learning_rate": 2.6816683016363335e-05, "loss": 0.1981, "step": 5157 }, { "epoch": 0.4015765656172449, "grad_norm": 1.347459907513897, "learning_rate": 2.681198989093796e-05, "loss": 0.1842, "step": 5158 }, { "epoch": 0.4016544207094545, "grad_norm": 1.312085890839985, "learning_rate": 2.6807296341157633e-05, "loss": 0.2084, "step": 5159 }, { "epoch": 0.40173227580166415, "grad_norm": 1.2752351930478918, "learning_rate": 2.6802602367314742e-05, "loss": 0.2047, "step": 5160 }, { "epoch": 0.4018101308938738, "grad_norm": 1.3600887890516282, "learning_rate": 2.6797907969701697e-05, "loss": 0.2152, "step": 5161 }, { "epoch": 0.4018879859860834, "grad_norm": 1.3892964939940093, "learning_rate": 2.6793213148610943e-05, "loss": 0.1907, "step": 5162 }, { "epoch": 0.40196584107829303, "grad_norm": 1.3089762577400748, "learning_rate": 2.6788517904334935e-05, "loss": 0.2205, "step": 5163 }, { "epoch": 0.4020436961705027, "grad_norm": 1.4438797571859532, "learning_rate": 2.6783822237166172e-05, "loss": 0.177, "step": 5164 }, { "epoch": 0.40212155126271226, "grad_norm": 1.347997600887066, "learning_rate": 2.6779126147397167e-05, "loss": 0.1936, "step": 5165 }, { "epoch": 0.4021994063549219, "grad_norm": 1.2614564895287375, "learning_rate": 2.6774429635320472e-05, "loss": 0.1747, "step": 5166 }, { "epoch": 0.40227726144713155, "grad_norm": 1.3567622403913646, "learning_rate": 2.6769732701228656e-05, "loss": 0.1775, "step": 5167 }, { "epoch": 0.40235511653934114, "grad_norm": 1.3240034495928583, "learning_rate": 2.676503534541431e-05, "loss": 0.1692, "step": 5168 }, { "epoch": 0.4024329716315508, "grad_norm": 1.3625755917237041, "learning_rate": 2.6760337568170056e-05, "loss": 0.1636, "step": 5169 }, { "epoch": 0.40251082672376043, "grad_norm": 1.504586094137955, "learning_rate": 2.6755639369788552e-05, "loss": 0.2062, "step": 5170 }, { "epoch": 0.40258868181597, "grad_norm": 1.3218487733028281, "learning_rate": 2.6750940750562472e-05, "loss": 0.1942, "step": 5171 }, { "epoch": 0.40266653690817966, "grad_norm": 1.3281265640088888, "learning_rate": 2.6746241710784513e-05, "loss": 0.1826, "step": 5172 }, { "epoch": 0.40274439200038925, "grad_norm": 1.3717420223445955, "learning_rate": 2.674154225074741e-05, "loss": 0.18, "step": 5173 }, { "epoch": 0.4028222470925989, "grad_norm": 1.2623167695782402, "learning_rate": 2.67368423707439e-05, "loss": 0.1602, "step": 5174 }, { "epoch": 0.40290010218480854, "grad_norm": 1.4767721809626946, "learning_rate": 2.673214207106679e-05, "loss": 0.2271, "step": 5175 }, { "epoch": 0.40297795727701813, "grad_norm": 1.305005716149057, "learning_rate": 2.6727441352008868e-05, "loss": 0.1904, "step": 5176 }, { "epoch": 0.4030558123692278, "grad_norm": 1.4836534405731125, "learning_rate": 2.672274021386297e-05, "loss": 0.2391, "step": 5177 }, { "epoch": 0.4031336674614374, "grad_norm": 1.4791674321819812, "learning_rate": 2.671803865692196e-05, "loss": 0.2038, "step": 5178 }, { "epoch": 0.403211522553647, "grad_norm": 1.304413229249491, "learning_rate": 2.6713336681478713e-05, "loss": 0.203, "step": 5179 }, { "epoch": 0.40328937764585665, "grad_norm": 1.3722033867565695, "learning_rate": 2.6708634287826156e-05, "loss": 0.1809, "step": 5180 }, { "epoch": 0.4033672327380663, "grad_norm": 1.2693463595033037, "learning_rate": 2.670393147625721e-05, "loss": 0.1876, "step": 5181 }, { "epoch": 0.4034450878302759, "grad_norm": 1.297948301231107, "learning_rate": 2.6699228247064844e-05, "loss": 0.2081, "step": 5182 }, { "epoch": 0.40352294292248553, "grad_norm": 1.2886155678289495, "learning_rate": 2.669452460054205e-05, "loss": 0.2065, "step": 5183 }, { "epoch": 0.4036007980146952, "grad_norm": 1.3614471415650147, "learning_rate": 2.668982053698183e-05, "loss": 0.2269, "step": 5184 }, { "epoch": 0.40367865310690476, "grad_norm": 1.3123470466400082, "learning_rate": 2.668511605667725e-05, "loss": 0.1996, "step": 5185 }, { "epoch": 0.4037565081991144, "grad_norm": 1.4685463034785695, "learning_rate": 2.6680411159921358e-05, "loss": 0.2342, "step": 5186 }, { "epoch": 0.40383436329132405, "grad_norm": 1.3215926644889857, "learning_rate": 2.6675705847007248e-05, "loss": 0.1959, "step": 5187 }, { "epoch": 0.40391221838353364, "grad_norm": 1.3896371832995447, "learning_rate": 2.667100011822804e-05, "loss": 0.1756, "step": 5188 }, { "epoch": 0.4039900734757433, "grad_norm": 1.3093789134725056, "learning_rate": 2.666629397387689e-05, "loss": 0.1921, "step": 5189 }, { "epoch": 0.40406792856795287, "grad_norm": 1.2846785174545963, "learning_rate": 2.6661587414246955e-05, "loss": 0.1789, "step": 5190 }, { "epoch": 0.4041457836601625, "grad_norm": 1.328258455546114, "learning_rate": 2.665688043963144e-05, "loss": 0.1586, "step": 5191 }, { "epoch": 0.40422363875237216, "grad_norm": 1.2941276221619484, "learning_rate": 2.6652173050323554e-05, "loss": 0.1752, "step": 5192 }, { "epoch": 0.40430149384458175, "grad_norm": 1.4208420945187437, "learning_rate": 2.6647465246616563e-05, "loss": 0.1715, "step": 5193 }, { "epoch": 0.4043793489367914, "grad_norm": 1.305086542301395, "learning_rate": 2.6642757028803732e-05, "loss": 0.1758, "step": 5194 }, { "epoch": 0.40445720402900104, "grad_norm": 1.298421396256419, "learning_rate": 2.6638048397178363e-05, "loss": 0.1908, "step": 5195 }, { "epoch": 0.4045350591212106, "grad_norm": 1.3576346518927591, "learning_rate": 2.663333935203378e-05, "loss": 0.1749, "step": 5196 }, { "epoch": 0.40461291421342027, "grad_norm": 1.3427389689585498, "learning_rate": 2.6628629893663328e-05, "loss": 0.1756, "step": 5197 }, { "epoch": 0.4046907693056299, "grad_norm": 1.3231290119144143, "learning_rate": 2.6623920022360398e-05, "loss": 0.1992, "step": 5198 }, { "epoch": 0.4047686243978395, "grad_norm": 1.271310647676467, "learning_rate": 2.6619209738418382e-05, "loss": 0.1724, "step": 5199 }, { "epoch": 0.40484647949004915, "grad_norm": 1.3244282292644423, "learning_rate": 2.6614499042130715e-05, "loss": 0.1795, "step": 5200 }, { "epoch": 0.40484647949004915, "eval_loss": 0.024223675951361656, "eval_runtime": 162.2111, "eval_samples_per_second": 17.755, "eval_steps_per_second": 0.635, "step": 5200 }, { "epoch": 0.4049243345822588, "grad_norm": 1.2725454307029374, "learning_rate": 2.660978793379085e-05, "loss": 0.182, "step": 5201 }, { "epoch": 0.4050021896744684, "grad_norm": 1.50263363303428, "learning_rate": 2.660507641369226e-05, "loss": 0.2909, "step": 5202 }, { "epoch": 0.405080044766678, "grad_norm": 1.407203645869642, "learning_rate": 2.660036448212846e-05, "loss": 0.1935, "step": 5203 }, { "epoch": 0.40515789985888767, "grad_norm": 1.29977424511932, "learning_rate": 2.6595652139392982e-05, "loss": 0.2128, "step": 5204 }, { "epoch": 0.40523575495109726, "grad_norm": 1.3171029523170148, "learning_rate": 2.659093938577937e-05, "loss": 0.1907, "step": 5205 }, { "epoch": 0.4053136100433069, "grad_norm": 1.432410990045999, "learning_rate": 2.6586226221581214e-05, "loss": 0.1978, "step": 5206 }, { "epoch": 0.4053914651355165, "grad_norm": 1.2899549992647703, "learning_rate": 2.6581512647092123e-05, "loss": 0.1671, "step": 5207 }, { "epoch": 0.40546932022772614, "grad_norm": 1.3705344924175693, "learning_rate": 2.6576798662605737e-05, "loss": 0.2277, "step": 5208 }, { "epoch": 0.4055471753199358, "grad_norm": 1.3042748478093928, "learning_rate": 2.6572084268415707e-05, "loss": 0.1773, "step": 5209 }, { "epoch": 0.40562503041214537, "grad_norm": 1.4377251627876193, "learning_rate": 2.656736946481571e-05, "loss": 0.2, "step": 5210 }, { "epoch": 0.405702885504355, "grad_norm": 1.5484611123245384, "learning_rate": 2.6562654252099468e-05, "loss": 0.2045, "step": 5211 }, { "epoch": 0.40578074059656466, "grad_norm": 1.3644125353068028, "learning_rate": 2.6557938630560714e-05, "loss": 0.1835, "step": 5212 }, { "epoch": 0.40585859568877425, "grad_norm": 1.3288976618980437, "learning_rate": 2.655322260049321e-05, "loss": 0.1793, "step": 5213 }, { "epoch": 0.4059364507809839, "grad_norm": 1.4534010972968792, "learning_rate": 2.6548506162190744e-05, "loss": 0.2262, "step": 5214 }, { "epoch": 0.40601430587319354, "grad_norm": 1.330741414368252, "learning_rate": 2.6543789315947116e-05, "loss": 0.1793, "step": 5215 }, { "epoch": 0.4060921609654031, "grad_norm": 1.4151353386821275, "learning_rate": 2.653907206205618e-05, "loss": 0.166, "step": 5216 }, { "epoch": 0.40617001605761277, "grad_norm": 1.3839043824397899, "learning_rate": 2.6534354400811788e-05, "loss": 0.2172, "step": 5217 }, { "epoch": 0.4062478711498224, "grad_norm": 1.3037273463925774, "learning_rate": 2.6529636332507835e-05, "loss": 0.2109, "step": 5218 }, { "epoch": 0.406325726242032, "grad_norm": 1.4597282851542657, "learning_rate": 2.6524917857438226e-05, "loss": 0.2209, "step": 5219 }, { "epoch": 0.40640358133424165, "grad_norm": 1.3688231682767784, "learning_rate": 2.65201989758969e-05, "loss": 0.1976, "step": 5220 }, { "epoch": 0.4064814364264513, "grad_norm": 1.3717169748633837, "learning_rate": 2.651547968817783e-05, "loss": 0.2048, "step": 5221 }, { "epoch": 0.4065592915186609, "grad_norm": 1.1903186583727228, "learning_rate": 2.6510759994575002e-05, "loss": 0.156, "step": 5222 }, { "epoch": 0.4066371466108705, "grad_norm": 1.3036110670095462, "learning_rate": 2.6506039895382428e-05, "loss": 0.1846, "step": 5223 }, { "epoch": 0.4067150017030801, "grad_norm": 1.3543292946800274, "learning_rate": 2.6501319390894153e-05, "loss": 0.2068, "step": 5224 }, { "epoch": 0.40679285679528976, "grad_norm": 1.3941061141753994, "learning_rate": 2.649659848140423e-05, "loss": 0.2203, "step": 5225 }, { "epoch": 0.4068707118874994, "grad_norm": 1.2854229923729033, "learning_rate": 2.6491877167206767e-05, "loss": 0.1671, "step": 5226 }, { "epoch": 0.406948566979709, "grad_norm": 1.3724444432966385, "learning_rate": 2.6487155448595872e-05, "loss": 0.1783, "step": 5227 }, { "epoch": 0.40702642207191864, "grad_norm": 1.4212142322378325, "learning_rate": 2.6482433325865683e-05, "loss": 0.228, "step": 5228 }, { "epoch": 0.4071042771641283, "grad_norm": 1.3181705089191784, "learning_rate": 2.6477710799310362e-05, "loss": 0.1947, "step": 5229 }, { "epoch": 0.40718213225633787, "grad_norm": 1.2882844835508689, "learning_rate": 2.6472987869224106e-05, "loss": 0.1991, "step": 5230 }, { "epoch": 0.4072599873485475, "grad_norm": 1.356422117081712, "learning_rate": 2.646826453590114e-05, "loss": 0.2378, "step": 5231 }, { "epoch": 0.40733784244075716, "grad_norm": 1.402743880968666, "learning_rate": 2.6463540799635695e-05, "loss": 0.2228, "step": 5232 }, { "epoch": 0.40741569753296675, "grad_norm": 1.3400673203710987, "learning_rate": 2.6458816660722035e-05, "loss": 0.1957, "step": 5233 }, { "epoch": 0.4074935526251764, "grad_norm": 1.321959768352143, "learning_rate": 2.645409211945446e-05, "loss": 0.1875, "step": 5234 }, { "epoch": 0.40757140771738604, "grad_norm": 1.3734545859512213, "learning_rate": 2.6449367176127282e-05, "loss": 0.1806, "step": 5235 }, { "epoch": 0.4076492628095956, "grad_norm": 1.2841744701882662, "learning_rate": 2.644464183103484e-05, "loss": 0.1797, "step": 5236 }, { "epoch": 0.40772711790180527, "grad_norm": 1.3368425117795795, "learning_rate": 2.6439916084471507e-05, "loss": 0.2099, "step": 5237 }, { "epoch": 0.4078049729940149, "grad_norm": 1.2647299706470523, "learning_rate": 2.6435189936731674e-05, "loss": 0.1849, "step": 5238 }, { "epoch": 0.4078828280862245, "grad_norm": 1.3036850404445797, "learning_rate": 2.6430463388109754e-05, "loss": 0.1655, "step": 5239 }, { "epoch": 0.40796068317843415, "grad_norm": 1.3733661749254753, "learning_rate": 2.6425736438900193e-05, "loss": 0.2057, "step": 5240 }, { "epoch": 0.4080385382706438, "grad_norm": 1.300266846578988, "learning_rate": 2.642100908939745e-05, "loss": 0.1894, "step": 5241 }, { "epoch": 0.4081163933628534, "grad_norm": 1.324059616179741, "learning_rate": 2.641628133989603e-05, "loss": 0.1874, "step": 5242 }, { "epoch": 0.408194248455063, "grad_norm": 1.352731496893061, "learning_rate": 2.6411553190690438e-05, "loss": 0.1825, "step": 5243 }, { "epoch": 0.4082721035472726, "grad_norm": 1.2204733666570322, "learning_rate": 2.6406824642075214e-05, "loss": 0.1547, "step": 5244 }, { "epoch": 0.40834995863948226, "grad_norm": 1.3069847540190946, "learning_rate": 2.640209569434494e-05, "loss": 0.1818, "step": 5245 }, { "epoch": 0.4084278137316919, "grad_norm": 1.3282791166499646, "learning_rate": 2.6397366347794187e-05, "loss": 0.1733, "step": 5246 }, { "epoch": 0.4085056688239015, "grad_norm": 1.2638143271190718, "learning_rate": 2.639263660271759e-05, "loss": 0.2081, "step": 5247 }, { "epoch": 0.40858352391611114, "grad_norm": 1.3160411156985876, "learning_rate": 2.638790645940977e-05, "loss": 0.2212, "step": 5248 }, { "epoch": 0.4086613790083208, "grad_norm": 1.4421435896660266, "learning_rate": 2.6383175918165412e-05, "loss": 0.2052, "step": 5249 }, { "epoch": 0.40873923410053037, "grad_norm": 1.395693531135515, "learning_rate": 2.6378444979279193e-05, "loss": 0.2031, "step": 5250 }, { "epoch": 0.40873923410053037, "eval_loss": 0.023784620687365532, "eval_runtime": 162.7816, "eval_samples_per_second": 17.692, "eval_steps_per_second": 0.633, "step": 5250 }, { "epoch": 0.40881708919274, "grad_norm": 1.248712252612365, "learning_rate": 2.6373713643045838e-05, "loss": 0.1581, "step": 5251 }, { "epoch": 0.40889494428494966, "grad_norm": 1.485410981226993, "learning_rate": 2.636898190976008e-05, "loss": 0.2001, "step": 5252 }, { "epoch": 0.40897279937715925, "grad_norm": 1.1579776873198395, "learning_rate": 2.6364249779716686e-05, "loss": 0.1626, "step": 5253 }, { "epoch": 0.4090506544693689, "grad_norm": 1.3305363661800833, "learning_rate": 2.6359517253210437e-05, "loss": 0.198, "step": 5254 }, { "epoch": 0.40912850956157853, "grad_norm": 1.3510452021972053, "learning_rate": 2.6354784330536166e-05, "loss": 0.1938, "step": 5255 }, { "epoch": 0.4092063646537881, "grad_norm": 1.4293905424196938, "learning_rate": 2.63500510119887e-05, "loss": 0.1817, "step": 5256 }, { "epoch": 0.40928421974599777, "grad_norm": 1.533837303806768, "learning_rate": 2.6345317297862903e-05, "loss": 0.2192, "step": 5257 }, { "epoch": 0.4093620748382074, "grad_norm": 1.3345780577058037, "learning_rate": 2.6340583188453663e-05, "loss": 0.1747, "step": 5258 }, { "epoch": 0.409439929930417, "grad_norm": 1.394650210895609, "learning_rate": 2.6335848684055894e-05, "loss": 0.1777, "step": 5259 }, { "epoch": 0.40951778502262665, "grad_norm": 1.2232495752897954, "learning_rate": 2.633111378496453e-05, "loss": 0.1503, "step": 5260 }, { "epoch": 0.40959564011483623, "grad_norm": 1.4246848068470255, "learning_rate": 2.632637849147454e-05, "loss": 0.1759, "step": 5261 }, { "epoch": 0.4096734952070459, "grad_norm": 1.3109907555893021, "learning_rate": 2.632164280388091e-05, "loss": 0.1999, "step": 5262 }, { "epoch": 0.4097513502992555, "grad_norm": 1.3726748724801405, "learning_rate": 2.631690672247864e-05, "loss": 0.1851, "step": 5263 }, { "epoch": 0.4098292053914651, "grad_norm": 1.3702517639692409, "learning_rate": 2.6312170247562777e-05, "loss": 0.1828, "step": 5264 }, { "epoch": 0.40990706048367476, "grad_norm": 1.3136788211742216, "learning_rate": 2.6307433379428373e-05, "loss": 0.2013, "step": 5265 }, { "epoch": 0.4099849155758844, "grad_norm": 1.3746984740214592, "learning_rate": 2.6302696118370524e-05, "loss": 0.1791, "step": 5266 }, { "epoch": 0.410062770668094, "grad_norm": 1.3948813792664367, "learning_rate": 2.629795846468433e-05, "loss": 0.2237, "step": 5267 }, { "epoch": 0.41014062576030363, "grad_norm": 1.1798255466187433, "learning_rate": 2.6293220418664925e-05, "loss": 0.1587, "step": 5268 }, { "epoch": 0.4102184808525133, "grad_norm": 1.2953908864418344, "learning_rate": 2.6288481980607463e-05, "loss": 0.1715, "step": 5269 }, { "epoch": 0.41029633594472287, "grad_norm": 1.4184869644709563, "learning_rate": 2.6283743150807143e-05, "loss": 0.1905, "step": 5270 }, { "epoch": 0.4103741910369325, "grad_norm": 1.2886211802500507, "learning_rate": 2.627900392955916e-05, "loss": 0.1679, "step": 5271 }, { "epoch": 0.41045204612914216, "grad_norm": 1.3415393801888076, "learning_rate": 2.6274264317158736e-05, "loss": 0.163, "step": 5272 }, { "epoch": 0.41052990122135175, "grad_norm": 1.4145124471596466, "learning_rate": 2.6269524313901146e-05, "loss": 0.1772, "step": 5273 }, { "epoch": 0.4106077563135614, "grad_norm": 1.4416417017932486, "learning_rate": 2.6264783920081655e-05, "loss": 0.1905, "step": 5274 }, { "epoch": 0.41068561140577103, "grad_norm": 1.2659907994412483, "learning_rate": 2.6260043135995575e-05, "loss": 0.1676, "step": 5275 }, { "epoch": 0.4107634664979806, "grad_norm": 1.3042947884306988, "learning_rate": 2.625530196193824e-05, "loss": 0.1699, "step": 5276 }, { "epoch": 0.41084132159019027, "grad_norm": 1.265912383353021, "learning_rate": 2.6250560398204984e-05, "loss": 0.1608, "step": 5277 }, { "epoch": 0.41091917668239986, "grad_norm": 1.3433695962618941, "learning_rate": 2.6245818445091204e-05, "loss": 0.2035, "step": 5278 }, { "epoch": 0.4109970317746095, "grad_norm": 1.328563390217805, "learning_rate": 2.6241076102892283e-05, "loss": 0.1639, "step": 5279 }, { "epoch": 0.41107488686681914, "grad_norm": 1.3075213265882288, "learning_rate": 2.6236333371903665e-05, "loss": 0.2163, "step": 5280 }, { "epoch": 0.41115274195902873, "grad_norm": 1.2969001902814343, "learning_rate": 2.623159025242079e-05, "loss": 0.1961, "step": 5281 }, { "epoch": 0.4112305970512384, "grad_norm": 1.227388906899161, "learning_rate": 2.622684674473913e-05, "loss": 0.1827, "step": 5282 }, { "epoch": 0.411308452143448, "grad_norm": 1.348377302916012, "learning_rate": 2.6222102849154185e-05, "loss": 0.2, "step": 5283 }, { "epoch": 0.4113863072356576, "grad_norm": 1.4157651473274693, "learning_rate": 2.6217358565961482e-05, "loss": 0.1861, "step": 5284 }, { "epoch": 0.41146416232786726, "grad_norm": 1.3899256171338705, "learning_rate": 2.621261389545657e-05, "loss": 0.1844, "step": 5285 }, { "epoch": 0.4115420174200769, "grad_norm": 1.4082934459075396, "learning_rate": 2.6207868837935006e-05, "loss": 0.194, "step": 5286 }, { "epoch": 0.4116198725122865, "grad_norm": 1.3056443762539534, "learning_rate": 2.6203123393692392e-05, "loss": 0.1707, "step": 5287 }, { "epoch": 0.41169772760449613, "grad_norm": 1.3340390416747285, "learning_rate": 2.6198377563024355e-05, "loss": 0.183, "step": 5288 }, { "epoch": 0.4117755826967058, "grad_norm": 1.3300510291659926, "learning_rate": 2.6193631346226527e-05, "loss": 0.174, "step": 5289 }, { "epoch": 0.41185343778891537, "grad_norm": 1.2788152455239152, "learning_rate": 2.6188884743594577e-05, "loss": 0.1879, "step": 5290 }, { "epoch": 0.411931292881125, "grad_norm": 1.3178555225298252, "learning_rate": 2.6184137755424206e-05, "loss": 0.1673, "step": 5291 }, { "epoch": 0.41200914797333466, "grad_norm": 1.3289016007263479, "learning_rate": 2.617939038201111e-05, "loss": 0.2176, "step": 5292 }, { "epoch": 0.41208700306554424, "grad_norm": 1.3251576716343778, "learning_rate": 2.6174642623651047e-05, "loss": 0.1857, "step": 5293 }, { "epoch": 0.4121648581577539, "grad_norm": 1.2918940501105183, "learning_rate": 2.6169894480639764e-05, "loss": 0.1861, "step": 5294 }, { "epoch": 0.41224271324996353, "grad_norm": 1.270117425649657, "learning_rate": 2.6165145953273062e-05, "loss": 0.1707, "step": 5295 }, { "epoch": 0.4123205683421731, "grad_norm": 1.3008510891673963, "learning_rate": 2.6160397041846744e-05, "loss": 0.1755, "step": 5296 }, { "epoch": 0.41239842343438277, "grad_norm": 1.3458075215288126, "learning_rate": 2.615564774665665e-05, "loss": 0.1906, "step": 5297 }, { "epoch": 0.41247627852659235, "grad_norm": 1.2754245334378287, "learning_rate": 2.6150898067998628e-05, "loss": 0.1941, "step": 5298 }, { "epoch": 0.412554133618802, "grad_norm": 1.3021839212987982, "learning_rate": 2.6146148006168573e-05, "loss": 0.185, "step": 5299 }, { "epoch": 0.41263198871101164, "grad_norm": 1.354632134990883, "learning_rate": 2.6141397561462388e-05, "loss": 0.199, "step": 5300 }, { "epoch": 0.41263198871101164, "eval_loss": 0.023448335006833076, "eval_runtime": 162.5134, "eval_samples_per_second": 17.722, "eval_steps_per_second": 0.634, "step": 5300 }, { "epoch": 0.41270984380322123, "grad_norm": 1.2136526452668912, "learning_rate": 2.6136646734176e-05, "loss": 0.1875, "step": 5301 }, { "epoch": 0.4127876988954309, "grad_norm": 1.3032915076234473, "learning_rate": 2.6131895524605362e-05, "loss": 0.1886, "step": 5302 }, { "epoch": 0.4128655539876405, "grad_norm": 1.5818535199309096, "learning_rate": 2.6127143933046463e-05, "loss": 0.226, "step": 5303 }, { "epoch": 0.4129434090798501, "grad_norm": 1.2499656315299597, "learning_rate": 2.6122391959795294e-05, "loss": 0.1868, "step": 5304 }, { "epoch": 0.41302126417205975, "grad_norm": 1.1927609688453178, "learning_rate": 2.6117639605147882e-05, "loss": 0.1569, "step": 5305 }, { "epoch": 0.4130991192642694, "grad_norm": 1.366788631319098, "learning_rate": 2.6112886869400282e-05, "loss": 0.1587, "step": 5306 }, { "epoch": 0.413176974356479, "grad_norm": 1.3635523646492, "learning_rate": 2.6108133752848557e-05, "loss": 0.167, "step": 5307 }, { "epoch": 0.41325482944868863, "grad_norm": 1.37397869727835, "learning_rate": 2.6103380255788812e-05, "loss": 0.1821, "step": 5308 }, { "epoch": 0.4133326845408983, "grad_norm": 1.3915992024578958, "learning_rate": 2.609862637851717e-05, "loss": 0.1788, "step": 5309 }, { "epoch": 0.41341053963310787, "grad_norm": 1.3915629858360898, "learning_rate": 2.6093872121329766e-05, "loss": 0.1946, "step": 5310 }, { "epoch": 0.4134883947253175, "grad_norm": 1.3273353634103116, "learning_rate": 2.6089117484522772e-05, "loss": 0.1901, "step": 5311 }, { "epoch": 0.41356624981752715, "grad_norm": 1.2272064910101392, "learning_rate": 2.6084362468392384e-05, "loss": 0.1793, "step": 5312 }, { "epoch": 0.41364410490973674, "grad_norm": 1.2982414638522566, "learning_rate": 2.6079607073234807e-05, "loss": 0.1849, "step": 5313 }, { "epoch": 0.4137219600019464, "grad_norm": 1.3563978212648915, "learning_rate": 2.6074851299346295e-05, "loss": 0.2068, "step": 5314 }, { "epoch": 0.413799815094156, "grad_norm": 1.3022866323335973, "learning_rate": 2.6070095147023093e-05, "loss": 0.1767, "step": 5315 }, { "epoch": 0.4138776701863656, "grad_norm": 1.3232830669737845, "learning_rate": 2.60653386165615e-05, "loss": 0.1964, "step": 5316 }, { "epoch": 0.41395552527857526, "grad_norm": 1.2702737144197913, "learning_rate": 2.606058170825782e-05, "loss": 0.1795, "step": 5317 }, { "epoch": 0.41403338037078485, "grad_norm": 1.2998855620105563, "learning_rate": 2.605582442240839e-05, "loss": 0.1887, "step": 5318 }, { "epoch": 0.4141112354629945, "grad_norm": 1.2336873796351422, "learning_rate": 2.6051066759309556e-05, "loss": 0.1463, "step": 5319 }, { "epoch": 0.41418909055520414, "grad_norm": 1.2452607371848325, "learning_rate": 2.6046308719257705e-05, "loss": 0.1622, "step": 5320 }, { "epoch": 0.41426694564741373, "grad_norm": 1.2427062724475175, "learning_rate": 2.6041550302549247e-05, "loss": 0.1473, "step": 5321 }, { "epoch": 0.4143448007396234, "grad_norm": 1.4361282630134915, "learning_rate": 2.60367915094806e-05, "loss": 0.1569, "step": 5322 }, { "epoch": 0.414422655831833, "grad_norm": 1.3096717198619456, "learning_rate": 2.603203234034822e-05, "loss": 0.1497, "step": 5323 }, { "epoch": 0.4145005109240426, "grad_norm": 1.316695415581322, "learning_rate": 2.602727279544858e-05, "loss": 0.1856, "step": 5324 }, { "epoch": 0.41457836601625225, "grad_norm": 1.2661455047927064, "learning_rate": 2.6022512875078167e-05, "loss": 0.1927, "step": 5325 }, { "epoch": 0.4146562211084619, "grad_norm": 1.380733559802251, "learning_rate": 2.6017752579533517e-05, "loss": 0.1868, "step": 5326 }, { "epoch": 0.4147340762006715, "grad_norm": 1.3258442011024492, "learning_rate": 2.6012991909111164e-05, "loss": 0.1566, "step": 5327 }, { "epoch": 0.41481193129288113, "grad_norm": 1.3194959516104565, "learning_rate": 2.6008230864107683e-05, "loss": 0.2188, "step": 5328 }, { "epoch": 0.4148897863850908, "grad_norm": 1.3983249784272573, "learning_rate": 2.600346944481966e-05, "loss": 0.1609, "step": 5329 }, { "epoch": 0.41496764147730036, "grad_norm": 1.280873241601807, "learning_rate": 2.5998707651543704e-05, "loss": 0.1652, "step": 5330 }, { "epoch": 0.41504549656951, "grad_norm": 1.2802156329677021, "learning_rate": 2.5993945484576458e-05, "loss": 0.1786, "step": 5331 }, { "epoch": 0.4151233516617196, "grad_norm": 1.3572170528245953, "learning_rate": 2.5989182944214588e-05, "loss": 0.1924, "step": 5332 }, { "epoch": 0.41520120675392924, "grad_norm": 1.457936665608982, "learning_rate": 2.598442003075477e-05, "loss": 0.1932, "step": 5333 }, { "epoch": 0.4152790618461389, "grad_norm": 1.3333638696200905, "learning_rate": 2.597965674449371e-05, "loss": 0.19, "step": 5334 }, { "epoch": 0.4153569169383485, "grad_norm": 1.2327856331663358, "learning_rate": 2.597489308572815e-05, "loss": 0.1627, "step": 5335 }, { "epoch": 0.4154347720305581, "grad_norm": 1.333517746558366, "learning_rate": 2.5970129054754822e-05, "loss": 0.1619, "step": 5336 }, { "epoch": 0.41551262712276776, "grad_norm": 1.3295090839959989, "learning_rate": 2.5965364651870527e-05, "loss": 0.1672, "step": 5337 }, { "epoch": 0.41559048221497735, "grad_norm": 1.2801413524499394, "learning_rate": 2.596059987737205e-05, "loss": 0.1739, "step": 5338 }, { "epoch": 0.415668337307187, "grad_norm": 1.2351786985263866, "learning_rate": 2.595583473155622e-05, "loss": 0.157, "step": 5339 }, { "epoch": 0.41574619239939664, "grad_norm": 1.361718650198361, "learning_rate": 2.595106921471988e-05, "loss": 0.1742, "step": 5340 }, { "epoch": 0.41582404749160623, "grad_norm": 1.205098438469684, "learning_rate": 2.5946303327159897e-05, "loss": 0.1651, "step": 5341 }, { "epoch": 0.4159019025838159, "grad_norm": 1.28404301807656, "learning_rate": 2.594153706917317e-05, "loss": 0.1724, "step": 5342 }, { "epoch": 0.4159797576760255, "grad_norm": 1.2887940915885998, "learning_rate": 2.593677044105661e-05, "loss": 0.1703, "step": 5343 }, { "epoch": 0.4160576127682351, "grad_norm": 1.265647910451376, "learning_rate": 2.5932003443107156e-05, "loss": 0.1748, "step": 5344 }, { "epoch": 0.41613546786044475, "grad_norm": 1.401576385754046, "learning_rate": 2.5927236075621777e-05, "loss": 0.2178, "step": 5345 }, { "epoch": 0.4162133229526544, "grad_norm": 1.205766951485255, "learning_rate": 2.592246833889744e-05, "loss": 0.1652, "step": 5346 }, { "epoch": 0.416291178044864, "grad_norm": 1.2392051145373595, "learning_rate": 2.5917700233231173e-05, "loss": 0.1815, "step": 5347 }, { "epoch": 0.41636903313707363, "grad_norm": 1.2726151847394382, "learning_rate": 2.5912931758919998e-05, "loss": 0.2171, "step": 5348 }, { "epoch": 0.4164468882292833, "grad_norm": 1.2149760990740177, "learning_rate": 2.590816291626096e-05, "loss": 0.1918, "step": 5349 }, { "epoch": 0.41652474332149286, "grad_norm": 1.2485327472677954, "learning_rate": 2.5903393705551144e-05, "loss": 0.16, "step": 5350 }, { "epoch": 0.41652474332149286, "eval_loss": 0.022787043824791908, "eval_runtime": 167.924, "eval_samples_per_second": 17.151, "eval_steps_per_second": 0.613, "step": 5350 }, { "epoch": 0.4166025984137025, "grad_norm": 1.2174648945943731, "learning_rate": 2.5898624127087653e-05, "loss": 0.1655, "step": 5351 }, { "epoch": 0.4166804535059121, "grad_norm": 1.3984171705271249, "learning_rate": 2.5893854181167605e-05, "loss": 0.1648, "step": 5352 }, { "epoch": 0.41675830859812174, "grad_norm": 1.3066167681160186, "learning_rate": 2.5889083868088146e-05, "loss": 0.1633, "step": 5353 }, { "epoch": 0.4168361636903314, "grad_norm": 1.1726254456961105, "learning_rate": 2.5884313188146432e-05, "loss": 0.1569, "step": 5354 }, { "epoch": 0.416914018782541, "grad_norm": 1.4115230360285733, "learning_rate": 2.5879542141639674e-05, "loss": 0.1846, "step": 5355 }, { "epoch": 0.4169918738747506, "grad_norm": 1.3146407150252686, "learning_rate": 2.587477072886508e-05, "loss": 0.1869, "step": 5356 }, { "epoch": 0.41706972896696026, "grad_norm": 1.3567305351920076, "learning_rate": 2.586999895011988e-05, "loss": 0.1772, "step": 5357 }, { "epoch": 0.41714758405916985, "grad_norm": 1.3865016401831631, "learning_rate": 2.5865226805701336e-05, "loss": 0.1898, "step": 5358 }, { "epoch": 0.4172254391513795, "grad_norm": 1.2974667734551952, "learning_rate": 2.586045429590673e-05, "loss": 0.2, "step": 5359 }, { "epoch": 0.41730329424358914, "grad_norm": 1.2621561722092072, "learning_rate": 2.5855681421033368e-05, "loss": 0.1536, "step": 5360 }, { "epoch": 0.41738114933579873, "grad_norm": 1.1672965791553784, "learning_rate": 2.5850908181378582e-05, "loss": 0.1656, "step": 5361 }, { "epoch": 0.4174590044280084, "grad_norm": 1.344208508936003, "learning_rate": 2.5846134577239716e-05, "loss": 0.1952, "step": 5362 }, { "epoch": 0.417536859520218, "grad_norm": 1.292787521838445, "learning_rate": 2.5841360608914144e-05, "loss": 0.1715, "step": 5363 }, { "epoch": 0.4176147146124276, "grad_norm": 1.321321476796311, "learning_rate": 2.5836586276699258e-05, "loss": 0.1665, "step": 5364 }, { "epoch": 0.41769256970463725, "grad_norm": 1.2944581017782255, "learning_rate": 2.5831811580892487e-05, "loss": 0.1855, "step": 5365 }, { "epoch": 0.4177704247968469, "grad_norm": 1.3873042642921802, "learning_rate": 2.5827036521791266e-05, "loss": 0.2107, "step": 5366 }, { "epoch": 0.4178482798890565, "grad_norm": 1.3041968077989188, "learning_rate": 2.5822261099693062e-05, "loss": 0.1698, "step": 5367 }, { "epoch": 0.41792613498126613, "grad_norm": 1.2271410804499954, "learning_rate": 2.5817485314895354e-05, "loss": 0.1687, "step": 5368 }, { "epoch": 0.4180039900734757, "grad_norm": 1.2509036312813413, "learning_rate": 2.5812709167695653e-05, "loss": 0.1742, "step": 5369 }, { "epoch": 0.41808184516568536, "grad_norm": 1.286857568150171, "learning_rate": 2.5807932658391498e-05, "loss": 0.1722, "step": 5370 }, { "epoch": 0.418159700257895, "grad_norm": 1.2743840014951762, "learning_rate": 2.5803155787280442e-05, "loss": 0.1678, "step": 5371 }, { "epoch": 0.4182375553501046, "grad_norm": 1.2764668942144943, "learning_rate": 2.5798378554660047e-05, "loss": 0.1682, "step": 5372 }, { "epoch": 0.41831541044231424, "grad_norm": 1.2783573880451018, "learning_rate": 2.5793600960827934e-05, "loss": 0.1687, "step": 5373 }, { "epoch": 0.4183932655345239, "grad_norm": 1.4386931838992574, "learning_rate": 2.5788823006081702e-05, "loss": 0.1881, "step": 5374 }, { "epoch": 0.4184711206267335, "grad_norm": 1.2772901239044852, "learning_rate": 2.5784044690719013e-05, "loss": 0.1662, "step": 5375 }, { "epoch": 0.4185489757189431, "grad_norm": 1.237136757399337, "learning_rate": 2.5779266015037534e-05, "loss": 0.1702, "step": 5376 }, { "epoch": 0.41862683081115276, "grad_norm": 1.2424201169509645, "learning_rate": 2.5774486979334934e-05, "loss": 0.2362, "step": 5377 }, { "epoch": 0.41870468590336235, "grad_norm": 1.210274316683329, "learning_rate": 2.5769707583908946e-05, "loss": 0.1744, "step": 5378 }, { "epoch": 0.418782540995572, "grad_norm": 1.287702347850863, "learning_rate": 2.576492782905729e-05, "loss": 0.2108, "step": 5379 }, { "epoch": 0.41886039608778164, "grad_norm": 1.3459530551158188, "learning_rate": 2.5760147715077736e-05, "loss": 0.1776, "step": 5380 }, { "epoch": 0.41893825117999123, "grad_norm": 1.3378208934422942, "learning_rate": 2.575536724226805e-05, "loss": 0.1577, "step": 5381 }, { "epoch": 0.41901610627220087, "grad_norm": 1.393923335627694, "learning_rate": 2.5750586410926036e-05, "loss": 0.2147, "step": 5382 }, { "epoch": 0.4190939613644105, "grad_norm": 1.2940914703154136, "learning_rate": 2.5745805221349517e-05, "loss": 0.1768, "step": 5383 }, { "epoch": 0.4191718164566201, "grad_norm": 1.2844627141987597, "learning_rate": 2.574102367383634e-05, "loss": 0.149, "step": 5384 }, { "epoch": 0.41924967154882975, "grad_norm": 1.308151458326121, "learning_rate": 2.573624176868438e-05, "loss": 0.1659, "step": 5385 }, { "epoch": 0.41932752664103934, "grad_norm": 1.2520682121651852, "learning_rate": 2.5731459506191518e-05, "loss": 0.155, "step": 5386 }, { "epoch": 0.419405381733249, "grad_norm": 1.2992336806626759, "learning_rate": 2.5726676886655664e-05, "loss": 0.1765, "step": 5387 }, { "epoch": 0.4194832368254586, "grad_norm": 1.2794479478486736, "learning_rate": 2.572189391037476e-05, "loss": 0.1955, "step": 5388 }, { "epoch": 0.4195610919176682, "grad_norm": 1.1739612637910197, "learning_rate": 2.5717110577646765e-05, "loss": 0.154, "step": 5389 }, { "epoch": 0.41963894700987786, "grad_norm": 1.3230750990143856, "learning_rate": 2.571232688876965e-05, "loss": 0.1835, "step": 5390 }, { "epoch": 0.4197168021020875, "grad_norm": 1.5219281106477989, "learning_rate": 2.5707542844041422e-05, "loss": 0.1919, "step": 5391 }, { "epoch": 0.4197946571942971, "grad_norm": 1.3618206786142566, "learning_rate": 2.57027584437601e-05, "loss": 0.2273, "step": 5392 }, { "epoch": 0.41987251228650674, "grad_norm": 1.4500773819909183, "learning_rate": 2.5697973688223736e-05, "loss": 0.1853, "step": 5393 }, { "epoch": 0.4199503673787164, "grad_norm": 1.3819987763480246, "learning_rate": 2.5693188577730396e-05, "loss": 0.1783, "step": 5394 }, { "epoch": 0.42002822247092597, "grad_norm": 1.2696415791494158, "learning_rate": 2.568840311257816e-05, "loss": 0.2227, "step": 5395 }, { "epoch": 0.4201060775631356, "grad_norm": 1.3414095211582615, "learning_rate": 2.5683617293065152e-05, "loss": 0.1556, "step": 5396 }, { "epoch": 0.42018393265534526, "grad_norm": 1.3175070671740206, "learning_rate": 2.56788311194895e-05, "loss": 0.1687, "step": 5397 }, { "epoch": 0.42026178774755485, "grad_norm": 1.1493597291472635, "learning_rate": 2.5674044592149362e-05, "loss": 0.1482, "step": 5398 }, { "epoch": 0.4203396428397645, "grad_norm": 1.197065379226191, "learning_rate": 2.5669257711342922e-05, "loss": 0.1972, "step": 5399 }, { "epoch": 0.42041749793197414, "grad_norm": 1.1423170957688118, "learning_rate": 2.566447047736837e-05, "loss": 0.1769, "step": 5400 }, { "epoch": 0.42041749793197414, "eval_loss": 0.022654497995972633, "eval_runtime": 167.211, "eval_samples_per_second": 17.224, "eval_steps_per_second": 0.616, "step": 5400 }, { "epoch": 0.4204953530241837, "grad_norm": 1.2336784499227302, "learning_rate": 2.5659682890523937e-05, "loss": 0.1514, "step": 5401 }, { "epoch": 0.42057320811639337, "grad_norm": 1.2757388384966384, "learning_rate": 2.5654894951107852e-05, "loss": 0.1913, "step": 5402 }, { "epoch": 0.42065106320860296, "grad_norm": 1.2757181872621712, "learning_rate": 2.5650106659418405e-05, "loss": 0.1778, "step": 5403 }, { "epoch": 0.4207289183008126, "grad_norm": 1.293703270526175, "learning_rate": 2.5645318015753863e-05, "loss": 0.1696, "step": 5404 }, { "epoch": 0.42080677339302225, "grad_norm": 1.212585380403769, "learning_rate": 2.5640529020412544e-05, "loss": 0.1575, "step": 5405 }, { "epoch": 0.42088462848523184, "grad_norm": 1.390686218733641, "learning_rate": 2.5635739673692787e-05, "loss": 0.1836, "step": 5406 }, { "epoch": 0.4209624835774415, "grad_norm": 1.3241476913486365, "learning_rate": 2.5630949975892936e-05, "loss": 0.1937, "step": 5407 }, { "epoch": 0.4210403386696511, "grad_norm": 1.1464334919371164, "learning_rate": 2.5626159927311362e-05, "loss": 0.1442, "step": 5408 }, { "epoch": 0.4211181937618607, "grad_norm": 1.3342053538462464, "learning_rate": 2.562136952824648e-05, "loss": 0.2081, "step": 5409 }, { "epoch": 0.42119604885407036, "grad_norm": 1.6169157709574244, "learning_rate": 2.561657877899669e-05, "loss": 0.1867, "step": 5410 }, { "epoch": 0.42127390394628, "grad_norm": 1.3462249632122358, "learning_rate": 2.5611787679860447e-05, "loss": 0.2048, "step": 5411 }, { "epoch": 0.4213517590384896, "grad_norm": 1.2808173073999916, "learning_rate": 2.560699623113621e-05, "loss": 0.1579, "step": 5412 }, { "epoch": 0.42142961413069924, "grad_norm": 1.2748006605372824, "learning_rate": 2.5602204433122455e-05, "loss": 0.181, "step": 5413 }, { "epoch": 0.4215074692229089, "grad_norm": 1.3340238503994506, "learning_rate": 2.559741228611771e-05, "loss": 0.1461, "step": 5414 }, { "epoch": 0.42158532431511847, "grad_norm": 1.343513078448162, "learning_rate": 2.559261979042048e-05, "loss": 0.1685, "step": 5415 }, { "epoch": 0.4216631794073281, "grad_norm": 1.2902332410777777, "learning_rate": 2.5587826946329326e-05, "loss": 0.1636, "step": 5416 }, { "epoch": 0.42174103449953776, "grad_norm": 1.2703130222476564, "learning_rate": 2.558303375414282e-05, "loss": 0.1557, "step": 5417 }, { "epoch": 0.42181888959174735, "grad_norm": 1.3458666233113383, "learning_rate": 2.5578240214159556e-05, "loss": 0.2116, "step": 5418 }, { "epoch": 0.421896744683957, "grad_norm": 1.3408969793365089, "learning_rate": 2.5573446326678145e-05, "loss": 0.2142, "step": 5419 }, { "epoch": 0.42197459977616664, "grad_norm": 1.2836025880564434, "learning_rate": 2.556865209199722e-05, "loss": 0.1802, "step": 5420 }, { "epoch": 0.4220524548683762, "grad_norm": 1.2495805742148727, "learning_rate": 2.5563857510415454e-05, "loss": 0.1752, "step": 5421 }, { "epoch": 0.42213030996058587, "grad_norm": 1.261132213663147, "learning_rate": 2.5559062582231508e-05, "loss": 0.1783, "step": 5422 }, { "epoch": 0.42220816505279546, "grad_norm": 1.3381137510105452, "learning_rate": 2.55542673077441e-05, "loss": 0.164, "step": 5423 }, { "epoch": 0.4222860201450051, "grad_norm": 1.4247985371051906, "learning_rate": 2.5549471687251943e-05, "loss": 0.1973, "step": 5424 }, { "epoch": 0.42236387523721475, "grad_norm": 1.2744279508624716, "learning_rate": 2.5544675721053792e-05, "loss": 0.1819, "step": 5425 }, { "epoch": 0.42244173032942434, "grad_norm": 1.3018158861366464, "learning_rate": 2.5539879409448393e-05, "loss": 0.181, "step": 5426 }, { "epoch": 0.422519585421634, "grad_norm": 1.2743137045854762, "learning_rate": 2.553508275273455e-05, "loss": 0.1541, "step": 5427 }, { "epoch": 0.4225974405138436, "grad_norm": 1.2144197964928003, "learning_rate": 2.5530285751211075e-05, "loss": 0.1916, "step": 5428 }, { "epoch": 0.4226752956060532, "grad_norm": 1.2574604791827804, "learning_rate": 2.552548840517679e-05, "loss": 0.1669, "step": 5429 }, { "epoch": 0.42275315069826286, "grad_norm": 1.1407271495973095, "learning_rate": 2.5520690714930546e-05, "loss": 0.1468, "step": 5430 }, { "epoch": 0.4228310057904725, "grad_norm": 1.2540955729790386, "learning_rate": 2.5515892680771224e-05, "loss": 0.1721, "step": 5431 }, { "epoch": 0.4229088608826821, "grad_norm": 1.233812881363519, "learning_rate": 2.5511094302997713e-05, "loss": 0.1638, "step": 5432 }, { "epoch": 0.42298671597489174, "grad_norm": 1.3546911149823828, "learning_rate": 2.550629558190893e-05, "loss": 0.2137, "step": 5433 }, { "epoch": 0.4230645710671014, "grad_norm": 1.346816184440199, "learning_rate": 2.5501496517803816e-05, "loss": 0.1709, "step": 5434 }, { "epoch": 0.42314242615931097, "grad_norm": 1.2387621647981832, "learning_rate": 2.549669711098133e-05, "loss": 0.1415, "step": 5435 }, { "epoch": 0.4232202812515206, "grad_norm": 1.280970997197908, "learning_rate": 2.5491897361740447e-05, "loss": 0.1766, "step": 5436 }, { "epoch": 0.42329813634373026, "grad_norm": 1.4245446074894377, "learning_rate": 2.548709727038017e-05, "loss": 0.2178, "step": 5437 }, { "epoch": 0.42337599143593985, "grad_norm": 1.3118797461285083, "learning_rate": 2.5482296837199532e-05, "loss": 0.1756, "step": 5438 }, { "epoch": 0.4234538465281495, "grad_norm": 1.4068849259312837, "learning_rate": 2.5477496062497567e-05, "loss": 0.2124, "step": 5439 }, { "epoch": 0.4235317016203591, "grad_norm": 1.3101307150720736, "learning_rate": 2.547269494657335e-05, "loss": 0.1993, "step": 5440 }, { "epoch": 0.4236095567125687, "grad_norm": 1.4047754831046402, "learning_rate": 2.546789348972595e-05, "loss": 0.1955, "step": 5441 }, { "epoch": 0.42368741180477837, "grad_norm": 1.366305504533939, "learning_rate": 2.5463091692254494e-05, "loss": 0.1988, "step": 5442 }, { "epoch": 0.42376526689698796, "grad_norm": 1.2170284499665576, "learning_rate": 2.5458289554458104e-05, "loss": 0.1638, "step": 5443 }, { "epoch": 0.4238431219891976, "grad_norm": 1.2059008012501988, "learning_rate": 2.545348707663593e-05, "loss": 0.181, "step": 5444 }, { "epoch": 0.42392097708140725, "grad_norm": 1.6580578711886438, "learning_rate": 2.544868425908715e-05, "loss": 0.1858, "step": 5445 }, { "epoch": 0.42399883217361684, "grad_norm": 1.2373983262865027, "learning_rate": 2.544388110211094e-05, "loss": 0.1683, "step": 5446 }, { "epoch": 0.4240766872658265, "grad_norm": 1.2176986897264939, "learning_rate": 2.5439077606006536e-05, "loss": 0.1622, "step": 5447 }, { "epoch": 0.4241545423580361, "grad_norm": 1.216494919407033, "learning_rate": 2.543427377107316e-05, "loss": 0.1738, "step": 5448 }, { "epoch": 0.4242323974502457, "grad_norm": 1.3596423602728422, "learning_rate": 2.5429469597610065e-05, "loss": 0.1638, "step": 5449 }, { "epoch": 0.42431025254245536, "grad_norm": 1.4358604843793679, "learning_rate": 2.5424665085916546e-05, "loss": 0.1898, "step": 5450 }, { "epoch": 0.42431025254245536, "eval_loss": 0.022544976323843002, "eval_runtime": 162.5486, "eval_samples_per_second": 17.718, "eval_steps_per_second": 0.634, "step": 5450 }, { "epoch": 0.424388107634665, "grad_norm": 1.3623254432288183, "learning_rate": 2.5419860236291883e-05, "loss": 0.1802, "step": 5451 }, { "epoch": 0.4244659627268746, "grad_norm": 1.2687960335643202, "learning_rate": 2.541505504903541e-05, "loss": 0.1596, "step": 5452 }, { "epoch": 0.42454381781908423, "grad_norm": 1.2019359404228929, "learning_rate": 2.5410249524446453e-05, "loss": 0.1779, "step": 5453 }, { "epoch": 0.4246216729112939, "grad_norm": 1.2097694123428122, "learning_rate": 2.540544366282438e-05, "loss": 0.191, "step": 5454 }, { "epoch": 0.42469952800350347, "grad_norm": 1.2679771452331032, "learning_rate": 2.540063746446858e-05, "loss": 0.1686, "step": 5455 }, { "epoch": 0.4247773830957131, "grad_norm": 1.2902070334432254, "learning_rate": 2.5395830929678447e-05, "loss": 0.1754, "step": 5456 }, { "epoch": 0.4248552381879227, "grad_norm": 1.2274766328329372, "learning_rate": 2.5391024058753414e-05, "loss": 0.1485, "step": 5457 }, { "epoch": 0.42493309328013235, "grad_norm": 1.2730368884612142, "learning_rate": 2.5386216851992923e-05, "loss": 0.1733, "step": 5458 }, { "epoch": 0.425010948372342, "grad_norm": 1.2372082059251905, "learning_rate": 2.5381409309696433e-05, "loss": 0.1486, "step": 5459 }, { "epoch": 0.4250888034645516, "grad_norm": 1.2847486157413612, "learning_rate": 2.537660143216344e-05, "loss": 0.1847, "step": 5460 }, { "epoch": 0.4251666585567612, "grad_norm": 1.3498568691195145, "learning_rate": 2.5371793219693456e-05, "loss": 0.1776, "step": 5461 }, { "epoch": 0.42524451364897087, "grad_norm": 1.342467421127971, "learning_rate": 2.5366984672586e-05, "loss": 0.2079, "step": 5462 }, { "epoch": 0.42532236874118046, "grad_norm": 1.3214680578520743, "learning_rate": 2.5362175791140625e-05, "loss": 0.1833, "step": 5463 }, { "epoch": 0.4254002238333901, "grad_norm": 1.3074105292146843, "learning_rate": 2.5357366575656902e-05, "loss": 0.1953, "step": 5464 }, { "epoch": 0.42547807892559975, "grad_norm": 1.2992138951707648, "learning_rate": 2.5352557026434425e-05, "loss": 0.2018, "step": 5465 }, { "epoch": 0.42555593401780933, "grad_norm": 1.2483324860516287, "learning_rate": 2.5347747143772804e-05, "loss": 0.1699, "step": 5466 }, { "epoch": 0.425633789110019, "grad_norm": 1.319569010628594, "learning_rate": 2.5342936927971675e-05, "loss": 0.1689, "step": 5467 }, { "epoch": 0.4257116442022286, "grad_norm": 1.398314808597151, "learning_rate": 2.5338126379330687e-05, "loss": 0.1678, "step": 5468 }, { "epoch": 0.4257894992944382, "grad_norm": 1.286140430803602, "learning_rate": 2.533331549814951e-05, "loss": 0.1781, "step": 5469 }, { "epoch": 0.42586735438664786, "grad_norm": 1.3547856458006589, "learning_rate": 2.5328504284727856e-05, "loss": 0.1643, "step": 5470 }, { "epoch": 0.4259452094788575, "grad_norm": 1.1908863135575907, "learning_rate": 2.532369273936543e-05, "loss": 0.1679, "step": 5471 }, { "epoch": 0.4260230645710671, "grad_norm": 1.2960306573216367, "learning_rate": 2.531888086236197e-05, "loss": 0.1745, "step": 5472 }, { "epoch": 0.42610091966327673, "grad_norm": 1.3901885453477762, "learning_rate": 2.531406865401723e-05, "loss": 0.2032, "step": 5473 }, { "epoch": 0.4261787747554864, "grad_norm": 1.2949318751636296, "learning_rate": 2.530925611463099e-05, "loss": 0.1857, "step": 5474 }, { "epoch": 0.42625662984769597, "grad_norm": 1.2484745485851165, "learning_rate": 2.5304443244503052e-05, "loss": 0.1813, "step": 5475 }, { "epoch": 0.4263344849399056, "grad_norm": 1.3231558410518973, "learning_rate": 2.529963004393324e-05, "loss": 0.1539, "step": 5476 }, { "epoch": 0.4264123400321152, "grad_norm": 1.2286393775457345, "learning_rate": 2.5294816513221373e-05, "loss": 0.1789, "step": 5477 }, { "epoch": 0.42649019512432484, "grad_norm": 1.236920950818191, "learning_rate": 2.5290002652667335e-05, "loss": 0.1641, "step": 5478 }, { "epoch": 0.4265680502165345, "grad_norm": 1.2872803025609394, "learning_rate": 2.5285188462570995e-05, "loss": 0.169, "step": 5479 }, { "epoch": 0.4266459053087441, "grad_norm": 1.4027425239622695, "learning_rate": 2.5280373943232253e-05, "loss": 0.1951, "step": 5480 }, { "epoch": 0.4267237604009537, "grad_norm": 1.0816107263170318, "learning_rate": 2.5275559094951042e-05, "loss": 0.1519, "step": 5481 }, { "epoch": 0.42680161549316337, "grad_norm": 1.1774536676360738, "learning_rate": 2.527074391802729e-05, "loss": 0.1506, "step": 5482 }, { "epoch": 0.42687947058537296, "grad_norm": 1.3217375679381913, "learning_rate": 2.526592841276097e-05, "loss": 0.2012, "step": 5483 }, { "epoch": 0.4269573256775826, "grad_norm": 1.2537486170662355, "learning_rate": 2.5261112579452062e-05, "loss": 0.1619, "step": 5484 }, { "epoch": 0.42703518076979224, "grad_norm": 1.211781948264191, "learning_rate": 2.5256296418400564e-05, "loss": 0.1675, "step": 5485 }, { "epoch": 0.42711303586200183, "grad_norm": 1.2712993206695138, "learning_rate": 2.525147992990651e-05, "loss": 0.1636, "step": 5486 }, { "epoch": 0.4271908909542115, "grad_norm": 1.1845613936028314, "learning_rate": 2.5246663114269937e-05, "loss": 0.1536, "step": 5487 }, { "epoch": 0.4272687460464211, "grad_norm": 1.3797160097523347, "learning_rate": 2.524184597179092e-05, "loss": 0.2066, "step": 5488 }, { "epoch": 0.4273466011386307, "grad_norm": 1.2379734368522848, "learning_rate": 2.5237028502769537e-05, "loss": 0.1845, "step": 5489 }, { "epoch": 0.42742445623084035, "grad_norm": 1.2245131216306848, "learning_rate": 2.523221070750589e-05, "loss": 0.1406, "step": 5490 }, { "epoch": 0.42750231132305, "grad_norm": 1.4244031665531982, "learning_rate": 2.5227392586300116e-05, "loss": 0.1826, "step": 5491 }, { "epoch": 0.4275801664152596, "grad_norm": 1.3381485908559883, "learning_rate": 2.5222574139452344e-05, "loss": 0.1986, "step": 5492 }, { "epoch": 0.42765802150746923, "grad_norm": 1.2860846341259995, "learning_rate": 2.521775536726276e-05, "loss": 0.1823, "step": 5493 }, { "epoch": 0.4277358765996788, "grad_norm": 1.1929293572072934, "learning_rate": 2.5212936270031538e-05, "loss": 0.157, "step": 5494 }, { "epoch": 0.42781373169188847, "grad_norm": 1.1918901207192552, "learning_rate": 2.520811684805889e-05, "loss": 0.1567, "step": 5495 }, { "epoch": 0.4278915867840981, "grad_norm": 1.2611834126556554, "learning_rate": 2.5203297101645046e-05, "loss": 0.1898, "step": 5496 }, { "epoch": 0.4279694418763077, "grad_norm": 1.2561937639764686, "learning_rate": 2.5198477031090246e-05, "loss": 0.1837, "step": 5497 }, { "epoch": 0.42804729696851734, "grad_norm": 1.252767965874519, "learning_rate": 2.5193656636694767e-05, "loss": 0.1714, "step": 5498 }, { "epoch": 0.428125152060727, "grad_norm": 1.24709190122655, "learning_rate": 2.518883591875889e-05, "loss": 0.1845, "step": 5499 }, { "epoch": 0.4282030071529366, "grad_norm": 1.4147088638397265, "learning_rate": 2.518401487758292e-05, "loss": 0.1902, "step": 5500 }, { "epoch": 0.4282030071529366, "eval_loss": 0.021971365436911583, "eval_runtime": 162.4861, "eval_samples_per_second": 17.725, "eval_steps_per_second": 0.634, "step": 5500 }, { "epoch": 0.4282808622451462, "grad_norm": 1.1529743320834602, "learning_rate": 2.51791935134672e-05, "loss": 0.1531, "step": 5501 }, { "epoch": 0.42835871733735587, "grad_norm": 1.1972611378230633, "learning_rate": 2.517437182671206e-05, "loss": 0.1357, "step": 5502 }, { "epoch": 0.42843657242956545, "grad_norm": 1.286356228723014, "learning_rate": 2.516954981761788e-05, "loss": 0.1584, "step": 5503 }, { "epoch": 0.4285144275217751, "grad_norm": 1.329886288998017, "learning_rate": 2.5164727486485042e-05, "loss": 0.1689, "step": 5504 }, { "epoch": 0.42859228261398474, "grad_norm": 1.1895696461745966, "learning_rate": 2.515990483361396e-05, "loss": 0.1693, "step": 5505 }, { "epoch": 0.42867013770619433, "grad_norm": 1.4317621387404917, "learning_rate": 2.5155081859305063e-05, "loss": 0.1873, "step": 5506 }, { "epoch": 0.428747992798404, "grad_norm": 1.3168487204861914, "learning_rate": 2.51502585638588e-05, "loss": 0.169, "step": 5507 }, { "epoch": 0.4288258478906136, "grad_norm": 1.28867915687632, "learning_rate": 2.5145434947575624e-05, "loss": 0.1538, "step": 5508 }, { "epoch": 0.4289037029828232, "grad_norm": 1.2110060817398476, "learning_rate": 2.5140611010756053e-05, "loss": 0.2084, "step": 5509 }, { "epoch": 0.42898155807503285, "grad_norm": 1.5044203627240038, "learning_rate": 2.5135786753700568e-05, "loss": 0.1989, "step": 5510 }, { "epoch": 0.42905941316724244, "grad_norm": 1.3606366661041078, "learning_rate": 2.513096217670971e-05, "loss": 0.1619, "step": 5511 }, { "epoch": 0.4291372682594521, "grad_norm": 1.3150847713197455, "learning_rate": 2.5126137280084032e-05, "loss": 0.1844, "step": 5512 }, { "epoch": 0.42921512335166173, "grad_norm": 1.1762531236448195, "learning_rate": 2.5121312064124083e-05, "loss": 0.1712, "step": 5513 }, { "epoch": 0.4292929784438713, "grad_norm": 1.22371375196431, "learning_rate": 2.511648652913048e-05, "loss": 0.1655, "step": 5514 }, { "epoch": 0.42937083353608096, "grad_norm": 1.2474989756076729, "learning_rate": 2.5111660675403807e-05, "loss": 0.1538, "step": 5515 }, { "epoch": 0.4294486886282906, "grad_norm": 1.2818077562602581, "learning_rate": 2.5106834503244705e-05, "loss": 0.1796, "step": 5516 }, { "epoch": 0.4295265437205002, "grad_norm": 1.2428871934765007, "learning_rate": 2.5102008012953817e-05, "loss": 0.1604, "step": 5517 }, { "epoch": 0.42960439881270984, "grad_norm": 1.3135231895770274, "learning_rate": 2.509718120483181e-05, "loss": 0.1819, "step": 5518 }, { "epoch": 0.4296822539049195, "grad_norm": 1.2290660727022233, "learning_rate": 2.5092354079179374e-05, "loss": 0.1648, "step": 5519 }, { "epoch": 0.4297601089971291, "grad_norm": 1.340356130869248, "learning_rate": 2.508752663629721e-05, "loss": 0.193, "step": 5520 }, { "epoch": 0.4298379640893387, "grad_norm": 1.2419209014201231, "learning_rate": 2.5082698876486052e-05, "loss": 0.1475, "step": 5521 }, { "epoch": 0.42991581918154836, "grad_norm": 1.1993797794005863, "learning_rate": 2.5077870800046648e-05, "loss": 0.1655, "step": 5522 }, { "epoch": 0.42999367427375795, "grad_norm": 1.3755400183384472, "learning_rate": 2.5073042407279764e-05, "loss": 0.1823, "step": 5523 }, { "epoch": 0.4300715293659676, "grad_norm": 7.563996254986686, "learning_rate": 2.506821369848618e-05, "loss": 0.1507, "step": 5524 }, { "epoch": 0.43014938445817724, "grad_norm": 1.3928270346030782, "learning_rate": 2.5063384673966705e-05, "loss": 0.1727, "step": 5525 }, { "epoch": 0.43022723955038683, "grad_norm": 1.3371446016003876, "learning_rate": 2.5058555334022162e-05, "loss": 0.1583, "step": 5526 }, { "epoch": 0.4303050946425965, "grad_norm": 1.504734036137279, "learning_rate": 2.5053725678953403e-05, "loss": 0.1769, "step": 5527 }, { "epoch": 0.4303829497348061, "grad_norm": 1.2981599767912353, "learning_rate": 2.504889570906129e-05, "loss": 0.1778, "step": 5528 }, { "epoch": 0.4304608048270157, "grad_norm": 1.262165217281902, "learning_rate": 2.50440654246467e-05, "loss": 0.1606, "step": 5529 }, { "epoch": 0.43053865991922535, "grad_norm": 1.4791471187811982, "learning_rate": 2.5039234826010554e-05, "loss": 0.2065, "step": 5530 }, { "epoch": 0.43061651501143494, "grad_norm": 4.519684045938217, "learning_rate": 2.5034403913453754e-05, "loss": 0.1736, "step": 5531 }, { "epoch": 0.4306943701036446, "grad_norm": 1.4197890448267956, "learning_rate": 2.5029572687277256e-05, "loss": 0.1564, "step": 5532 }, { "epoch": 0.43077222519585423, "grad_norm": 1.2973997034111062, "learning_rate": 2.5024741147782025e-05, "loss": 0.1619, "step": 5533 }, { "epoch": 0.4308500802880638, "grad_norm": 2.120809164824697, "learning_rate": 2.5019909295269037e-05, "loss": 0.172, "step": 5534 }, { "epoch": 0.43092793538027346, "grad_norm": 1.9758964525419882, "learning_rate": 2.5015077130039298e-05, "loss": 0.1637, "step": 5535 }, { "epoch": 0.4310057904724831, "grad_norm": 1.3214134887231261, "learning_rate": 2.5010244652393816e-05, "loss": 0.1523, "step": 5536 }, { "epoch": 0.4310836455646927, "grad_norm": 1.2591758751445508, "learning_rate": 2.500541186263365e-05, "loss": 0.176, "step": 5537 }, { "epoch": 0.43116150065690234, "grad_norm": 1.3963047333191827, "learning_rate": 2.5000578761059854e-05, "loss": 0.1756, "step": 5538 }, { "epoch": 0.431239355749112, "grad_norm": 1.2558602403668724, "learning_rate": 2.4995745347973506e-05, "loss": 0.1436, "step": 5539 }, { "epoch": 0.4313172108413216, "grad_norm": 1.1933489202612453, "learning_rate": 2.4990911623675707e-05, "loss": 0.1546, "step": 5540 }, { "epoch": 0.4313950659335312, "grad_norm": 1.3629665633800798, "learning_rate": 2.4986077588467566e-05, "loss": 0.1803, "step": 5541 }, { "epoch": 0.43147292102574086, "grad_norm": 1.3752932842298655, "learning_rate": 2.4981243242650228e-05, "loss": 0.1914, "step": 5542 }, { "epoch": 0.43155077611795045, "grad_norm": 1.2349487429516557, "learning_rate": 2.4976408586524863e-05, "loss": 0.1666, "step": 5543 }, { "epoch": 0.4316286312101601, "grad_norm": 1.2679017018002365, "learning_rate": 2.497157362039262e-05, "loss": 0.1853, "step": 5544 }, { "epoch": 0.43170648630236974, "grad_norm": 1.3475195713140615, "learning_rate": 2.4966738344554714e-05, "loss": 0.1578, "step": 5545 }, { "epoch": 0.43178434139457933, "grad_norm": 1.1741285299053172, "learning_rate": 2.496190275931235e-05, "loss": 0.1644, "step": 5546 }, { "epoch": 0.431862196486789, "grad_norm": 1.2867060175964733, "learning_rate": 2.495706686496678e-05, "loss": 0.1501, "step": 5547 }, { "epoch": 0.43194005157899856, "grad_norm": 1.33957461416245, "learning_rate": 2.4952230661819236e-05, "loss": 0.176, "step": 5548 }, { "epoch": 0.4320179066712082, "grad_norm": 1.1640399348976822, "learning_rate": 2.4947394150170994e-05, "loss": 0.1476, "step": 5549 }, { "epoch": 0.43209576176341785, "grad_norm": 1.200990992181428, "learning_rate": 2.4942557330323357e-05, "loss": 0.1621, "step": 5550 }, { "epoch": 0.43209576176341785, "eval_loss": 0.021080344915390015, "eval_runtime": 162.6996, "eval_samples_per_second": 17.701, "eval_steps_per_second": 0.633, "step": 5550 }, { "epoch": 0.43217361685562744, "grad_norm": 1.287154803066726, "learning_rate": 2.4937720202577628e-05, "loss": 0.1978, "step": 5551 }, { "epoch": 0.4322514719478371, "grad_norm": 1.2712475907262997, "learning_rate": 2.4932882767235146e-05, "loss": 0.19, "step": 5552 }, { "epoch": 0.43232932704004673, "grad_norm": 1.2907868699010279, "learning_rate": 2.4928045024597253e-05, "loss": 0.182, "step": 5553 }, { "epoch": 0.4324071821322563, "grad_norm": 1.2613856571164852, "learning_rate": 2.4923206974965312e-05, "loss": 0.1588, "step": 5554 }, { "epoch": 0.43248503722446596, "grad_norm": 1.1933390077040575, "learning_rate": 2.491836861864072e-05, "loss": 0.1621, "step": 5555 }, { "epoch": 0.4325628923166756, "grad_norm": 1.3429712252153663, "learning_rate": 2.491352995592488e-05, "loss": 0.1608, "step": 5556 }, { "epoch": 0.4326407474088852, "grad_norm": 1.2039195129397664, "learning_rate": 2.4908690987119227e-05, "loss": 0.1518, "step": 5557 }, { "epoch": 0.43271860250109484, "grad_norm": 1.252646270868713, "learning_rate": 2.4903851712525192e-05, "loss": 0.1596, "step": 5558 }, { "epoch": 0.4327964575933045, "grad_norm": 1.2250054603350575, "learning_rate": 2.4899012132444246e-05, "loss": 0.1583, "step": 5559 }, { "epoch": 0.4328743126855141, "grad_norm": 1.250609652281461, "learning_rate": 2.489417224717787e-05, "loss": 0.1479, "step": 5560 }, { "epoch": 0.4329521677777237, "grad_norm": 1.3001604168120788, "learning_rate": 2.4889332057027574e-05, "loss": 0.1497, "step": 5561 }, { "epoch": 0.43303002286993336, "grad_norm": 1.17299310120088, "learning_rate": 2.488449156229487e-05, "loss": 0.1388, "step": 5562 }, { "epoch": 0.43310787796214295, "grad_norm": 1.3291625993844927, "learning_rate": 2.4879650763281302e-05, "loss": 0.1998, "step": 5563 }, { "epoch": 0.4331857330543526, "grad_norm": 1.3549549833680987, "learning_rate": 2.4874809660288422e-05, "loss": 0.1654, "step": 5564 }, { "epoch": 0.4332635881465622, "grad_norm": 1.2400468356114172, "learning_rate": 2.486996825361782e-05, "loss": 0.1701, "step": 5565 }, { "epoch": 0.43334144323877183, "grad_norm": 1.2469298370907995, "learning_rate": 2.486512654357108e-05, "loss": 0.1859, "step": 5566 }, { "epoch": 0.4334192983309815, "grad_norm": 1.2297452013086718, "learning_rate": 2.4860284530449832e-05, "loss": 0.1919, "step": 5567 }, { "epoch": 0.43349715342319106, "grad_norm": 1.2957185163972824, "learning_rate": 2.48554422145557e-05, "loss": 0.1742, "step": 5568 }, { "epoch": 0.4335750085154007, "grad_norm": 1.4835365141111003, "learning_rate": 2.485059959619034e-05, "loss": 0.1451, "step": 5569 }, { "epoch": 0.43365286360761035, "grad_norm": 1.2522772568134397, "learning_rate": 2.4845756675655422e-05, "loss": 0.1633, "step": 5570 }, { "epoch": 0.43373071869981994, "grad_norm": 1.2179885065865286, "learning_rate": 2.4840913453252643e-05, "loss": 0.1952, "step": 5571 }, { "epoch": 0.4338085737920296, "grad_norm": 1.322989269207816, "learning_rate": 2.4836069929283712e-05, "loss": 0.1683, "step": 5572 }, { "epoch": 0.43388642888423923, "grad_norm": 1.2651719612451962, "learning_rate": 2.483122610405036e-05, "loss": 0.1778, "step": 5573 }, { "epoch": 0.4339642839764488, "grad_norm": 1.2830486099434497, "learning_rate": 2.4826381977854318e-05, "loss": 0.1664, "step": 5574 }, { "epoch": 0.43404213906865846, "grad_norm": 1.256400996721286, "learning_rate": 2.4821537550997376e-05, "loss": 0.1846, "step": 5575 }, { "epoch": 0.4341199941608681, "grad_norm": 1.2042039905227127, "learning_rate": 2.4816692823781304e-05, "loss": 0.173, "step": 5576 }, { "epoch": 0.4341978492530777, "grad_norm": 1.362237075292264, "learning_rate": 2.4811847796507907e-05, "loss": 0.1735, "step": 5577 }, { "epoch": 0.43427570434528734, "grad_norm": 1.2491765912167831, "learning_rate": 2.4807002469479013e-05, "loss": 0.1795, "step": 5578 }, { "epoch": 0.434353559437497, "grad_norm": 1.2020327461733726, "learning_rate": 2.480215684299646e-05, "loss": 0.1714, "step": 5579 }, { "epoch": 0.43443141452970657, "grad_norm": 1.4445934108149798, "learning_rate": 2.4797310917362104e-05, "loss": 0.1512, "step": 5580 }, { "epoch": 0.4345092696219162, "grad_norm": 1.2401341422107015, "learning_rate": 2.4792464692877835e-05, "loss": 0.1432, "step": 5581 }, { "epoch": 0.43458712471412586, "grad_norm": 1.2312425101359112, "learning_rate": 2.4787618169845534e-05, "loss": 0.1523, "step": 5582 }, { "epoch": 0.43466497980633545, "grad_norm": 1.2630419829715944, "learning_rate": 2.478277134856713e-05, "loss": 0.1595, "step": 5583 }, { "epoch": 0.4347428348985451, "grad_norm": 1.247939347737137, "learning_rate": 2.4777924229344546e-05, "loss": 0.1886, "step": 5584 }, { "epoch": 0.4348206899907547, "grad_norm": 1.2999872225290685, "learning_rate": 2.477307681247975e-05, "loss": 0.1782, "step": 5585 }, { "epoch": 0.4348985450829643, "grad_norm": 1.243545449603085, "learning_rate": 2.4768229098274698e-05, "loss": 0.1627, "step": 5586 }, { "epoch": 0.43497640017517397, "grad_norm": 1.3656389590074485, "learning_rate": 2.4763381087031384e-05, "loss": 0.2232, "step": 5587 }, { "epoch": 0.43505425526738356, "grad_norm": 1.4721132875090492, "learning_rate": 2.475853277905182e-05, "loss": 0.2195, "step": 5588 }, { "epoch": 0.4351321103595932, "grad_norm": 1.2844277715075412, "learning_rate": 2.4753684174638034e-05, "loss": 0.1887, "step": 5589 }, { "epoch": 0.43520996545180285, "grad_norm": 1.2563102175270462, "learning_rate": 2.4748835274092067e-05, "loss": 0.1705, "step": 5590 }, { "epoch": 0.43528782054401244, "grad_norm": 1.314449486144883, "learning_rate": 2.4743986077715985e-05, "loss": 0.1418, "step": 5591 }, { "epoch": 0.4353656756362221, "grad_norm": 1.2315256197624749, "learning_rate": 2.4739136585811863e-05, "loss": 0.1654, "step": 5592 }, { "epoch": 0.4354435307284317, "grad_norm": 1.2640111479195542, "learning_rate": 2.4734286798681813e-05, "loss": 0.1762, "step": 5593 }, { "epoch": 0.4355213858206413, "grad_norm": 1.2392732172728027, "learning_rate": 2.472943671662795e-05, "loss": 0.1521, "step": 5594 }, { "epoch": 0.43559924091285096, "grad_norm": 1.1336217495299867, "learning_rate": 2.4724586339952406e-05, "loss": 0.1286, "step": 5595 }, { "epoch": 0.4356770960050606, "grad_norm": 1.3453515239813418, "learning_rate": 2.4719735668957344e-05, "loss": 0.1607, "step": 5596 }, { "epoch": 0.4357549510972702, "grad_norm": 1.1359448497850129, "learning_rate": 2.4714884703944928e-05, "loss": 0.1566, "step": 5597 }, { "epoch": 0.43583280618947984, "grad_norm": 1.3020451525872856, "learning_rate": 2.471003344521736e-05, "loss": 0.1828, "step": 5598 }, { "epoch": 0.4359106612816895, "grad_norm": 1.443204140775072, "learning_rate": 2.4705181893076852e-05, "loss": 0.1997, "step": 5599 }, { "epoch": 0.43598851637389907, "grad_norm": 1.2448121965320262, "learning_rate": 2.4700330047825625e-05, "loss": 0.1594, "step": 5600 }, { "epoch": 0.43598851637389907, "eval_loss": 0.020846256986260414, "eval_runtime": 163.0493, "eval_samples_per_second": 17.663, "eval_steps_per_second": 0.632, "step": 5600 }, { "epoch": 0.4360663714661087, "grad_norm": 1.1510999068628576, "learning_rate": 2.4695477909765927e-05, "loss": 0.151, "step": 5601 }, { "epoch": 0.4361442265583183, "grad_norm": 1.1340973465181112, "learning_rate": 2.469062547920003e-05, "loss": 0.125, "step": 5602 }, { "epoch": 0.43622208165052795, "grad_norm": 1.2779569472854453, "learning_rate": 2.4685772756430207e-05, "loss": 0.1703, "step": 5603 }, { "epoch": 0.4362999367427376, "grad_norm": 1.1606305848593, "learning_rate": 2.4680919741758774e-05, "loss": 0.1513, "step": 5604 }, { "epoch": 0.4363777918349472, "grad_norm": 1.1943152412443534, "learning_rate": 2.467606643548804e-05, "loss": 0.1343, "step": 5605 }, { "epoch": 0.4364556469271568, "grad_norm": 1.3149547414366598, "learning_rate": 2.467121283792035e-05, "loss": 0.1691, "step": 5606 }, { "epoch": 0.43653350201936647, "grad_norm": 1.1311828515991058, "learning_rate": 2.4666358949358054e-05, "loss": 0.1371, "step": 5607 }, { "epoch": 0.43661135711157606, "grad_norm": 1.284518069086018, "learning_rate": 2.4661504770103526e-05, "loss": 0.1883, "step": 5608 }, { "epoch": 0.4366892122037857, "grad_norm": 1.334438292733351, "learning_rate": 2.4656650300459165e-05, "loss": 0.2155, "step": 5609 }, { "epoch": 0.43676706729599535, "grad_norm": 1.173954661274428, "learning_rate": 2.465179554072738e-05, "loss": 0.1755, "step": 5610 }, { "epoch": 0.43684492238820494, "grad_norm": 1.1752353953202554, "learning_rate": 2.4646940491210597e-05, "loss": 0.1854, "step": 5611 }, { "epoch": 0.4369227774804146, "grad_norm": 1.348007770604787, "learning_rate": 2.4642085152211266e-05, "loss": 0.1682, "step": 5612 }, { "epoch": 0.4370006325726242, "grad_norm": 1.2194830430120471, "learning_rate": 2.4637229524031842e-05, "loss": 0.1463, "step": 5613 }, { "epoch": 0.4370784876648338, "grad_norm": 1.2307299255845678, "learning_rate": 2.4632373606974826e-05, "loss": 0.1594, "step": 5614 }, { "epoch": 0.43715634275704346, "grad_norm": 1.2159894466241847, "learning_rate": 2.46275174013427e-05, "loss": 0.1639, "step": 5615 }, { "epoch": 0.4372341978492531, "grad_norm": 1.3152578817086187, "learning_rate": 2.4622660907437995e-05, "loss": 0.1806, "step": 5616 }, { "epoch": 0.4373120529414627, "grad_norm": 1.3268799795514323, "learning_rate": 2.4617804125563243e-05, "loss": 0.2057, "step": 5617 }, { "epoch": 0.43738990803367234, "grad_norm": 1.2753823621344502, "learning_rate": 2.4612947056021e-05, "loss": 0.1514, "step": 5618 }, { "epoch": 0.4374677631258819, "grad_norm": 1.3018082689348491, "learning_rate": 2.4608089699113845e-05, "loss": 0.1971, "step": 5619 }, { "epoch": 0.43754561821809157, "grad_norm": 1.1100675474429962, "learning_rate": 2.460323205514435e-05, "loss": 0.1426, "step": 5620 }, { "epoch": 0.4376234733103012, "grad_norm": 1.2755926483226279, "learning_rate": 2.4598374124415137e-05, "loss": 0.1632, "step": 5621 }, { "epoch": 0.4377013284025108, "grad_norm": 1.272562497104133, "learning_rate": 2.4593515907228836e-05, "loss": 0.1632, "step": 5622 }, { "epoch": 0.43777918349472045, "grad_norm": 1.1566002413838992, "learning_rate": 2.458865740388808e-05, "loss": 0.1531, "step": 5623 }, { "epoch": 0.4378570385869301, "grad_norm": 1.237230866357798, "learning_rate": 2.4583798614695545e-05, "loss": 0.1895, "step": 5624 }, { "epoch": 0.4379348936791397, "grad_norm": 1.3514693074270576, "learning_rate": 2.4578939539953894e-05, "loss": 0.171, "step": 5625 }, { "epoch": 0.4380127487713493, "grad_norm": 1.3007679774786154, "learning_rate": 2.4574080179965832e-05, "loss": 0.1793, "step": 5626 }, { "epoch": 0.43809060386355897, "grad_norm": 1.2324984516811888, "learning_rate": 2.4569220535034077e-05, "loss": 0.1548, "step": 5627 }, { "epoch": 0.43816845895576856, "grad_norm": 1.180414101570118, "learning_rate": 2.4564360605461365e-05, "loss": 0.1702, "step": 5628 }, { "epoch": 0.4382463140479782, "grad_norm": 1.3805515423807073, "learning_rate": 2.4559500391550434e-05, "loss": 0.1648, "step": 5629 }, { "epoch": 0.43832416914018785, "grad_norm": 1.407302492525996, "learning_rate": 2.4554639893604063e-05, "loss": 0.2137, "step": 5630 }, { "epoch": 0.43840202423239744, "grad_norm": 1.2734640655237845, "learning_rate": 2.4549779111925035e-05, "loss": 0.15, "step": 5631 }, { "epoch": 0.4384798793246071, "grad_norm": 1.2529848914636115, "learning_rate": 2.4544918046816153e-05, "loss": 0.153, "step": 5632 }, { "epoch": 0.4385577344168167, "grad_norm": 1.1711260635647995, "learning_rate": 2.4540056698580244e-05, "loss": 0.1533, "step": 5633 }, { "epoch": 0.4386355895090263, "grad_norm": 1.211602338619929, "learning_rate": 2.453519506752014e-05, "loss": 0.1449, "step": 5634 }, { "epoch": 0.43871344460123596, "grad_norm": 1.3008912473011318, "learning_rate": 2.45303331539387e-05, "loss": 0.1242, "step": 5635 }, { "epoch": 0.43879129969344555, "grad_norm": 1.275715408269646, "learning_rate": 2.4525470958138796e-05, "loss": 0.1396, "step": 5636 }, { "epoch": 0.4388691547856552, "grad_norm": 1.2271258879656763, "learning_rate": 2.452060848042333e-05, "loss": 0.1312, "step": 5637 }, { "epoch": 0.43894700987786484, "grad_norm": 1.29518088652982, "learning_rate": 2.4515745721095204e-05, "loss": 0.163, "step": 5638 }, { "epoch": 0.4390248649700744, "grad_norm": 1.2248693630398877, "learning_rate": 2.4510882680457342e-05, "loss": 0.1582, "step": 5639 }, { "epoch": 0.43910272006228407, "grad_norm": 1.2690580733829058, "learning_rate": 2.4506019358812688e-05, "loss": 0.1439, "step": 5640 }, { "epoch": 0.4391805751544937, "grad_norm": 1.1436451201504008, "learning_rate": 2.4501155756464212e-05, "loss": 0.1627, "step": 5641 }, { "epoch": 0.4392584302467033, "grad_norm": 1.2584685795199786, "learning_rate": 2.449629187371489e-05, "loss": 0.1374, "step": 5642 }, { "epoch": 0.43933628533891295, "grad_norm": 1.346195094792744, "learning_rate": 2.4491427710867727e-05, "loss": 0.187, "step": 5643 }, { "epoch": 0.4394141404311226, "grad_norm": 1.1282501406987209, "learning_rate": 2.4486563268225714e-05, "loss": 0.1172, "step": 5644 }, { "epoch": 0.4394919955233322, "grad_norm": 1.2144440456733545, "learning_rate": 2.4481698546091907e-05, "loss": 0.1663, "step": 5645 }, { "epoch": 0.4395698506155418, "grad_norm": 1.294551436670902, "learning_rate": 2.4476833544769344e-05, "loss": 0.1769, "step": 5646 }, { "epoch": 0.43964770570775147, "grad_norm": 1.2242402092684324, "learning_rate": 2.44719682645611e-05, "loss": 0.1511, "step": 5647 }, { "epoch": 0.43972556079996106, "grad_norm": 1.1550673478827977, "learning_rate": 2.4467102705770244e-05, "loss": 0.17, "step": 5648 }, { "epoch": 0.4398034158921707, "grad_norm": 1.3701115102452657, "learning_rate": 2.446223686869989e-05, "loss": 0.2359, "step": 5649 }, { "epoch": 0.43988127098438035, "grad_norm": 1.2646343381684804, "learning_rate": 2.4457370753653154e-05, "loss": 0.1765, "step": 5650 }, { "epoch": 0.43988127098438035, "eval_loss": 0.020341912284493446, "eval_runtime": 163.4183, "eval_samples_per_second": 17.623, "eval_steps_per_second": 0.63, "step": 5650 }, { "epoch": 0.43995912607658993, "grad_norm": 1.2643674371875984, "learning_rate": 2.4452504360933168e-05, "loss": 0.1818, "step": 5651 }, { "epoch": 0.4400369811687996, "grad_norm": 1.247121900500617, "learning_rate": 2.4447637690843098e-05, "loss": 0.1655, "step": 5652 }, { "epoch": 0.4401148362610092, "grad_norm": 1.2668777371705169, "learning_rate": 2.4442770743686102e-05, "loss": 0.1693, "step": 5653 }, { "epoch": 0.4401926913532188, "grad_norm": 1.3155757192716055, "learning_rate": 2.4437903519765366e-05, "loss": 0.1658, "step": 5654 }, { "epoch": 0.44027054644542846, "grad_norm": 1.2545792170519632, "learning_rate": 2.4433036019384105e-05, "loss": 0.1665, "step": 5655 }, { "epoch": 0.44034840153763805, "grad_norm": 1.20554471411973, "learning_rate": 2.442816824284554e-05, "loss": 0.1518, "step": 5656 }, { "epoch": 0.4404262566298477, "grad_norm": 1.3099473827331929, "learning_rate": 2.4423300190452902e-05, "loss": 0.1927, "step": 5657 }, { "epoch": 0.44050411172205733, "grad_norm": 1.1446811317808512, "learning_rate": 2.441843186250946e-05, "loss": 0.151, "step": 5658 }, { "epoch": 0.4405819668142669, "grad_norm": 1.2429431798502382, "learning_rate": 2.4413563259318468e-05, "loss": 0.1638, "step": 5659 }, { "epoch": 0.44065982190647657, "grad_norm": 1.2753794456209067, "learning_rate": 2.4408694381183243e-05, "loss": 0.182, "step": 5660 }, { "epoch": 0.4407376769986862, "grad_norm": 1.1959878898731662, "learning_rate": 2.440382522840708e-05, "loss": 0.1482, "step": 5661 }, { "epoch": 0.4408155320908958, "grad_norm": 1.136486453402851, "learning_rate": 2.4398955801293305e-05, "loss": 0.1692, "step": 5662 }, { "epoch": 0.44089338718310545, "grad_norm": 1.2510230816609595, "learning_rate": 2.4394086100145257e-05, "loss": 0.1689, "step": 5663 }, { "epoch": 0.4409712422753151, "grad_norm": 1.2244737546685687, "learning_rate": 2.4389216125266297e-05, "loss": 0.1813, "step": 5664 }, { "epoch": 0.4410490973675247, "grad_norm": 1.1602970160244155, "learning_rate": 2.4384345876959807e-05, "loss": 0.1483, "step": 5665 }, { "epoch": 0.4411269524597343, "grad_norm": 1.3207991537814168, "learning_rate": 2.4379475355529175e-05, "loss": 0.1499, "step": 5666 }, { "epoch": 0.44120480755194397, "grad_norm": 1.1905490158385432, "learning_rate": 2.4374604561277817e-05, "loss": 0.1626, "step": 5667 }, { "epoch": 0.44128266264415356, "grad_norm": 1.1598542168051085, "learning_rate": 2.4369733494509156e-05, "loss": 0.1402, "step": 5668 }, { "epoch": 0.4413605177363632, "grad_norm": 1.2519870027166409, "learning_rate": 2.4364862155526634e-05, "loss": 0.1817, "step": 5669 }, { "epoch": 0.44143837282857284, "grad_norm": 1.2411260012769956, "learning_rate": 2.435999054463372e-05, "loss": 0.1473, "step": 5670 }, { "epoch": 0.44151622792078243, "grad_norm": 1.229982705481948, "learning_rate": 2.435511866213389e-05, "loss": 0.1797, "step": 5671 }, { "epoch": 0.4415940830129921, "grad_norm": 1.2490880737080792, "learning_rate": 2.435024650833064e-05, "loss": 0.1337, "step": 5672 }, { "epoch": 0.44167193810520167, "grad_norm": 1.254707508165752, "learning_rate": 2.434537408352748e-05, "loss": 0.1566, "step": 5673 }, { "epoch": 0.4417497931974113, "grad_norm": 1.2213297856214123, "learning_rate": 2.4340501388027932e-05, "loss": 0.1635, "step": 5674 }, { "epoch": 0.44182764828962096, "grad_norm": 1.2120449090186258, "learning_rate": 2.4335628422135554e-05, "loss": 0.1466, "step": 5675 }, { "epoch": 0.44190550338183054, "grad_norm": 1.3038244916384483, "learning_rate": 2.433075518615391e-05, "loss": 0.1624, "step": 5676 }, { "epoch": 0.4419833584740402, "grad_norm": 1.245773825103816, "learning_rate": 2.432588168038657e-05, "loss": 0.1354, "step": 5677 }, { "epoch": 0.44206121356624983, "grad_norm": 1.1781231943983095, "learning_rate": 2.432100790513713e-05, "loss": 0.158, "step": 5678 }, { "epoch": 0.4421390686584594, "grad_norm": 1.213890390907053, "learning_rate": 2.4316133860709215e-05, "loss": 0.1686, "step": 5679 }, { "epoch": 0.44221692375066907, "grad_norm": 1.2217295625689566, "learning_rate": 2.4311259547406443e-05, "loss": 0.1592, "step": 5680 }, { "epoch": 0.4422947788428787, "grad_norm": 1.1709548795809626, "learning_rate": 2.430638496553247e-05, "loss": 0.1335, "step": 5681 }, { "epoch": 0.4423726339350883, "grad_norm": 1.3616755131851217, "learning_rate": 2.430151011539095e-05, "loss": 0.1851, "step": 5682 }, { "epoch": 0.44245048902729794, "grad_norm": 1.2316336925900349, "learning_rate": 2.4296634997285574e-05, "loss": 0.1723, "step": 5683 }, { "epoch": 0.4425283441195076, "grad_norm": 1.0969047202458, "learning_rate": 2.4291759611520035e-05, "loss": 0.1514, "step": 5684 }, { "epoch": 0.4426061992117172, "grad_norm": 1.2555800469807954, "learning_rate": 2.4286883958398044e-05, "loss": 0.161, "step": 5685 }, { "epoch": 0.4426840543039268, "grad_norm": 1.2237565399563466, "learning_rate": 2.428200803822333e-05, "loss": 0.1468, "step": 5686 }, { "epoch": 0.44276190939613647, "grad_norm": 1.1752544915781236, "learning_rate": 2.4277131851299642e-05, "loss": 0.1487, "step": 5687 }, { "epoch": 0.44283976448834605, "grad_norm": 1.1406478423412627, "learning_rate": 2.4272255397930744e-05, "loss": 0.1481, "step": 5688 }, { "epoch": 0.4429176195805557, "grad_norm": 1.2077441592926115, "learning_rate": 2.426737867842042e-05, "loss": 0.138, "step": 5689 }, { "epoch": 0.4429954746727653, "grad_norm": 1.2191593159453071, "learning_rate": 2.426250169307246e-05, "loss": 0.1568, "step": 5690 }, { "epoch": 0.44307332976497493, "grad_norm": 1.2849637231490274, "learning_rate": 2.4257624442190682e-05, "loss": 0.1537, "step": 5691 }, { "epoch": 0.4431511848571846, "grad_norm": 1.3958221454833515, "learning_rate": 2.4252746926078907e-05, "loss": 0.196, "step": 5692 }, { "epoch": 0.44322903994939417, "grad_norm": 1.1808294059619857, "learning_rate": 2.4247869145040993e-05, "loss": 0.1348, "step": 5693 }, { "epoch": 0.4433068950416038, "grad_norm": 1.1266719469256459, "learning_rate": 2.42429910993808e-05, "loss": 0.1341, "step": 5694 }, { "epoch": 0.44338475013381345, "grad_norm": 1.2389975777408408, "learning_rate": 2.4238112789402205e-05, "loss": 0.1577, "step": 5695 }, { "epoch": 0.44346260522602304, "grad_norm": 1.269493905784983, "learning_rate": 2.42332342154091e-05, "loss": 0.1754, "step": 5696 }, { "epoch": 0.4435404603182327, "grad_norm": 1.2885199895549047, "learning_rate": 2.422835537770541e-05, "loss": 0.1696, "step": 5697 }, { "epoch": 0.44361831541044233, "grad_norm": 1.3254858630977404, "learning_rate": 2.4223476276595045e-05, "loss": 0.1777, "step": 5698 }, { "epoch": 0.4436961705026519, "grad_norm": 1.1550494466276817, "learning_rate": 2.4218596912381967e-05, "loss": 0.1491, "step": 5699 }, { "epoch": 0.44377402559486157, "grad_norm": 1.1890929660996554, "learning_rate": 2.4213717285370128e-05, "loss": 0.1863, "step": 5700 }, { "epoch": 0.44377402559486157, "eval_loss": 0.019963828846812248, "eval_runtime": 162.5569, "eval_samples_per_second": 17.717, "eval_steps_per_second": 0.634, "step": 5700 }, { "epoch": 0.4438518806870712, "grad_norm": 1.144634897620754, "learning_rate": 2.4208837395863506e-05, "loss": 0.1521, "step": 5701 }, { "epoch": 0.4439297357792808, "grad_norm": 1.244922244217417, "learning_rate": 2.42039572441661e-05, "loss": 0.1632, "step": 5702 }, { "epoch": 0.44400759087149044, "grad_norm": 1.2513584358370295, "learning_rate": 2.419907683058192e-05, "loss": 0.1599, "step": 5703 }, { "epoch": 0.4440854459637001, "grad_norm": 1.2699181956530057, "learning_rate": 2.4194196155414983e-05, "loss": 0.1781, "step": 5704 }, { "epoch": 0.4441633010559097, "grad_norm": 1.168582914501567, "learning_rate": 2.418931521896935e-05, "loss": 0.1322, "step": 5705 }, { "epoch": 0.4442411561481193, "grad_norm": 1.1656088490335819, "learning_rate": 2.4184434021549065e-05, "loss": 0.1648, "step": 5706 }, { "epoch": 0.44431901124032896, "grad_norm": 1.3804078744551895, "learning_rate": 2.417955256345821e-05, "loss": 0.1699, "step": 5707 }, { "epoch": 0.44439686633253855, "grad_norm": 1.2208716090332115, "learning_rate": 2.4174670845000873e-05, "loss": 0.1589, "step": 5708 }, { "epoch": 0.4444747214247482, "grad_norm": 1.1866186929897897, "learning_rate": 2.4169788866481167e-05, "loss": 0.1472, "step": 5709 }, { "epoch": 0.4445525765169578, "grad_norm": 1.153012680448571, "learning_rate": 2.4164906628203215e-05, "loss": 0.1249, "step": 5710 }, { "epoch": 0.44463043160916743, "grad_norm": 1.088303607722662, "learning_rate": 2.416002413047116e-05, "loss": 0.1489, "step": 5711 }, { "epoch": 0.4447082867013771, "grad_norm": 1.2922346215407579, "learning_rate": 2.4155141373589154e-05, "loss": 0.1337, "step": 5712 }, { "epoch": 0.44478614179358666, "grad_norm": 1.2396936619709842, "learning_rate": 2.4150258357861364e-05, "loss": 0.1682, "step": 5713 }, { "epoch": 0.4448639968857963, "grad_norm": 1.2991288034655362, "learning_rate": 2.4145375083592e-05, "loss": 0.1514, "step": 5714 }, { "epoch": 0.44494185197800595, "grad_norm": 1.0995178770242358, "learning_rate": 2.4140491551085242e-05, "loss": 0.1387, "step": 5715 }, { "epoch": 0.44501970707021554, "grad_norm": 1.1387254090622099, "learning_rate": 2.4135607760645323e-05, "loss": 0.1237, "step": 5716 }, { "epoch": 0.4450975621624252, "grad_norm": 1.1739964037199797, "learning_rate": 2.4130723712576484e-05, "loss": 0.1561, "step": 5717 }, { "epoch": 0.44517541725463483, "grad_norm": 1.166437945259963, "learning_rate": 2.412583940718297e-05, "loss": 0.1294, "step": 5718 }, { "epoch": 0.4452532723468444, "grad_norm": 1.2953563178521155, "learning_rate": 2.412095484476906e-05, "loss": 0.1555, "step": 5719 }, { "epoch": 0.44533112743905406, "grad_norm": 1.2625262711766934, "learning_rate": 2.4116070025639032e-05, "loss": 0.1546, "step": 5720 }, { "epoch": 0.4454089825312637, "grad_norm": 1.1776340181733287, "learning_rate": 2.411118495009718e-05, "loss": 0.1471, "step": 5721 }, { "epoch": 0.4454868376234733, "grad_norm": 1.2149340093982293, "learning_rate": 2.4106299618447836e-05, "loss": 0.1777, "step": 5722 }, { "epoch": 0.44556469271568294, "grad_norm": 1.159916180898161, "learning_rate": 2.410141403099533e-05, "loss": 0.1701, "step": 5723 }, { "epoch": 0.4456425478078926, "grad_norm": 1.207923042445042, "learning_rate": 2.409652818804401e-05, "loss": 0.1513, "step": 5724 }, { "epoch": 0.4457204029001022, "grad_norm": 1.1345504914500866, "learning_rate": 2.4091642089898232e-05, "loss": 0.1667, "step": 5725 }, { "epoch": 0.4457982579923118, "grad_norm": 1.0020040029819928, "learning_rate": 2.4086755736862385e-05, "loss": 0.1394, "step": 5726 }, { "epoch": 0.4458761130845214, "grad_norm": 1.2017808087539816, "learning_rate": 2.4081869129240866e-05, "loss": 0.165, "step": 5727 }, { "epoch": 0.44595396817673105, "grad_norm": 1.1467325690860117, "learning_rate": 2.4076982267338087e-05, "loss": 0.1277, "step": 5728 }, { "epoch": 0.4460318232689407, "grad_norm": 1.2805794296585093, "learning_rate": 2.407209515145848e-05, "loss": 0.1648, "step": 5729 }, { "epoch": 0.4461096783611503, "grad_norm": 1.2030003177588393, "learning_rate": 2.406720778190648e-05, "loss": 0.1601, "step": 5730 }, { "epoch": 0.44618753345335993, "grad_norm": 1.0784754673778767, "learning_rate": 2.4062320158986556e-05, "loss": 0.1561, "step": 5731 }, { "epoch": 0.4462653885455696, "grad_norm": 1.2022845910829842, "learning_rate": 2.4057432283003176e-05, "loss": 0.1684, "step": 5732 }, { "epoch": 0.44634324363777916, "grad_norm": 1.1485760472313178, "learning_rate": 2.4052544154260845e-05, "loss": 0.1365, "step": 5733 }, { "epoch": 0.4464210987299888, "grad_norm": 1.2289362743642593, "learning_rate": 2.4047655773064057e-05, "loss": 0.1647, "step": 5734 }, { "epoch": 0.44649895382219845, "grad_norm": 1.230910137085632, "learning_rate": 2.4042767139717342e-05, "loss": 0.1807, "step": 5735 }, { "epoch": 0.44657680891440804, "grad_norm": 1.1734895584469323, "learning_rate": 2.4037878254525227e-05, "loss": 0.1232, "step": 5736 }, { "epoch": 0.4466546640066177, "grad_norm": 1.1668895711508802, "learning_rate": 2.4032989117792286e-05, "loss": 0.1365, "step": 5737 }, { "epoch": 0.44673251909882733, "grad_norm": 1.2498304069527424, "learning_rate": 2.402809972982308e-05, "loss": 0.1488, "step": 5738 }, { "epoch": 0.4468103741910369, "grad_norm": 1.1823851011544553, "learning_rate": 2.40232100909222e-05, "loss": 0.1648, "step": 5739 }, { "epoch": 0.44688822928324656, "grad_norm": 1.253089255330596, "learning_rate": 2.401832020139423e-05, "loss": 0.1547, "step": 5740 }, { "epoch": 0.4469660843754562, "grad_norm": 1.1917090994649737, "learning_rate": 2.40134300615438e-05, "loss": 0.1442, "step": 5741 }, { "epoch": 0.4470439394676658, "grad_norm": 1.2424242973587638, "learning_rate": 2.4008539671675547e-05, "loss": 0.1622, "step": 5742 }, { "epoch": 0.44712179455987544, "grad_norm": 1.20091988649652, "learning_rate": 2.400364903209412e-05, "loss": 0.1455, "step": 5743 }, { "epoch": 0.44719964965208503, "grad_norm": 1.2495153371083256, "learning_rate": 2.399875814310417e-05, "loss": 0.1789, "step": 5744 }, { "epoch": 0.4472775047442947, "grad_norm": 1.1912752847314876, "learning_rate": 2.399386700501038e-05, "loss": 0.1877, "step": 5745 }, { "epoch": 0.4473553598365043, "grad_norm": 1.331767431268742, "learning_rate": 2.398897561811745e-05, "loss": 0.1954, "step": 5746 }, { "epoch": 0.4474332149287139, "grad_norm": 1.2413026886173506, "learning_rate": 2.3984083982730092e-05, "loss": 0.1612, "step": 5747 }, { "epoch": 0.44751107002092355, "grad_norm": 1.2325150854354634, "learning_rate": 2.3979192099153034e-05, "loss": 0.1983, "step": 5748 }, { "epoch": 0.4475889251131332, "grad_norm": 1.1337708798237576, "learning_rate": 2.3974299967691e-05, "loss": 0.1402, "step": 5749 }, { "epoch": 0.4476667802053428, "grad_norm": 1.2840355906121423, "learning_rate": 2.396940758864877e-05, "loss": 0.1439, "step": 5750 }, { "epoch": 0.4476667802053428, "eval_loss": 0.019917353987693787, "eval_runtime": 162.6902, "eval_samples_per_second": 17.702, "eval_steps_per_second": 0.633, "step": 5750 }, { "epoch": 0.44774463529755243, "grad_norm": 1.1917566241514246, "learning_rate": 2.3964514962331095e-05, "loss": 0.1564, "step": 5751 }, { "epoch": 0.4478224903897621, "grad_norm": 1.201262333326006, "learning_rate": 2.395962208904278e-05, "loss": 0.1861, "step": 5752 }, { "epoch": 0.44790034548197166, "grad_norm": 1.230190903473204, "learning_rate": 2.395472896908862e-05, "loss": 0.1584, "step": 5753 }, { "epoch": 0.4479782005741813, "grad_norm": 1.1585323733538824, "learning_rate": 2.3949835602773427e-05, "loss": 0.1599, "step": 5754 }, { "epoch": 0.44805605566639095, "grad_norm": 1.2633352582689206, "learning_rate": 2.394494199040205e-05, "loss": 0.1473, "step": 5755 }, { "epoch": 0.44813391075860054, "grad_norm": 1.1927521290784882, "learning_rate": 2.394004813227933e-05, "loss": 0.1867, "step": 5756 }, { "epoch": 0.4482117658508102, "grad_norm": 1.2423737030410351, "learning_rate": 2.393515402871013e-05, "loss": 0.161, "step": 5757 }, { "epoch": 0.44828962094301983, "grad_norm": 1.2063383971120334, "learning_rate": 2.3930259679999325e-05, "loss": 0.1391, "step": 5758 }, { "epoch": 0.4483674760352294, "grad_norm": 1.2067712551213168, "learning_rate": 2.392536508645182e-05, "loss": 0.1586, "step": 5759 }, { "epoch": 0.44844533112743906, "grad_norm": 1.3786696912739878, "learning_rate": 2.3920470248372524e-05, "loss": 0.1765, "step": 5760 }, { "epoch": 0.4485231862196487, "grad_norm": 1.1446260118546256, "learning_rate": 2.3915575166066356e-05, "loss": 0.132, "step": 5761 }, { "epoch": 0.4486010413118583, "grad_norm": 1.2870449984301413, "learning_rate": 2.3910679839838257e-05, "loss": 0.1478, "step": 5762 }, { "epoch": 0.44867889640406794, "grad_norm": 1.1645984024959293, "learning_rate": 2.3905784269993192e-05, "loss": 0.1374, "step": 5763 }, { "epoch": 0.44875675149627753, "grad_norm": 1.21850400561062, "learning_rate": 2.3900888456836118e-05, "loss": 0.1612, "step": 5764 }, { "epoch": 0.4488346065884872, "grad_norm": 1.1808122642042165, "learning_rate": 2.3895992400672037e-05, "loss": 0.1424, "step": 5765 }, { "epoch": 0.4489124616806968, "grad_norm": 1.1997307462893962, "learning_rate": 2.3891096101805938e-05, "loss": 0.1627, "step": 5766 }, { "epoch": 0.4489903167729064, "grad_norm": 1.157016458131302, "learning_rate": 2.3886199560542846e-05, "loss": 0.1691, "step": 5767 }, { "epoch": 0.44906817186511605, "grad_norm": 1.2076799570352303, "learning_rate": 2.3881302777187783e-05, "loss": 0.1544, "step": 5768 }, { "epoch": 0.4491460269573257, "grad_norm": 1.1535418683891758, "learning_rate": 2.38764057520458e-05, "loss": 0.16, "step": 5769 }, { "epoch": 0.4492238820495353, "grad_norm": 1.2066023660074179, "learning_rate": 2.3871508485421963e-05, "loss": 0.1614, "step": 5770 }, { "epoch": 0.44930173714174493, "grad_norm": 1.1378540842985665, "learning_rate": 2.386661097762135e-05, "loss": 0.1499, "step": 5771 }, { "epoch": 0.4493795922339546, "grad_norm": 1.295999854499425, "learning_rate": 2.386171322894904e-05, "loss": 0.1433, "step": 5772 }, { "epoch": 0.44945744732616416, "grad_norm": 1.37767655431953, "learning_rate": 2.3856815239710147e-05, "loss": 0.2491, "step": 5773 }, { "epoch": 0.4495353024183738, "grad_norm": 1.185143220412184, "learning_rate": 2.3851917010209794e-05, "loss": 0.1532, "step": 5774 }, { "epoch": 0.44961315751058345, "grad_norm": 1.2134513702924732, "learning_rate": 2.3847018540753118e-05, "loss": 0.1707, "step": 5775 }, { "epoch": 0.44969101260279304, "grad_norm": 1.1725332357147422, "learning_rate": 2.3842119831645274e-05, "loss": 0.1256, "step": 5776 }, { "epoch": 0.4497688676950027, "grad_norm": 1.229254380435393, "learning_rate": 2.3837220883191416e-05, "loss": 0.1596, "step": 5777 }, { "epoch": 0.4498467227872123, "grad_norm": 1.2333964475874208, "learning_rate": 2.3832321695696735e-05, "loss": 0.1542, "step": 5778 }, { "epoch": 0.4499245778794219, "grad_norm": 1.1664774947618846, "learning_rate": 2.3827422269466434e-05, "loss": 0.1671, "step": 5779 }, { "epoch": 0.45000243297163156, "grad_norm": 1.1932986122830287, "learning_rate": 2.3822522604805703e-05, "loss": 0.1557, "step": 5780 }, { "epoch": 0.45008028806384115, "grad_norm": 1.2663998962000937, "learning_rate": 2.3817622702019793e-05, "loss": 0.1554, "step": 5781 }, { "epoch": 0.4501581431560508, "grad_norm": 1.2034314495065999, "learning_rate": 2.381272256141393e-05, "loss": 0.1645, "step": 5782 }, { "epoch": 0.45023599824826044, "grad_norm": 1.2649030558557233, "learning_rate": 2.380782218329337e-05, "loss": 0.1798, "step": 5783 }, { "epoch": 0.45031385334047, "grad_norm": 1.3134522371772706, "learning_rate": 2.3802921567963387e-05, "loss": 0.1843, "step": 5784 }, { "epoch": 0.45039170843267967, "grad_norm": 1.1421027720245016, "learning_rate": 2.379802071572926e-05, "loss": 0.1343, "step": 5785 }, { "epoch": 0.4504695635248893, "grad_norm": 1.205281617605235, "learning_rate": 2.3793119626896307e-05, "loss": 0.1482, "step": 5786 }, { "epoch": 0.4505474186170989, "grad_norm": 1.3140964401771298, "learning_rate": 2.3788218301769822e-05, "loss": 0.1708, "step": 5787 }, { "epoch": 0.45062527370930855, "grad_norm": 1.2670409073250732, "learning_rate": 2.378331674065515e-05, "loss": 0.1557, "step": 5788 }, { "epoch": 0.4507031288015182, "grad_norm": 1.211748921875922, "learning_rate": 2.3778414943857623e-05, "loss": 0.1822, "step": 5789 }, { "epoch": 0.4507809838937278, "grad_norm": 1.1599383531351368, "learning_rate": 2.3773512911682606e-05, "loss": 0.1452, "step": 5790 }, { "epoch": 0.4508588389859374, "grad_norm": 1.196635707049334, "learning_rate": 2.3768610644435476e-05, "loss": 0.1488, "step": 5791 }, { "epoch": 0.45093669407814707, "grad_norm": 1.242493450893305, "learning_rate": 2.376370814242161e-05, "loss": 0.1568, "step": 5792 }, { "epoch": 0.45101454917035666, "grad_norm": 1.271543749648738, "learning_rate": 2.3758805405946416e-05, "loss": 0.1641, "step": 5793 }, { "epoch": 0.4510924042625663, "grad_norm": 1.1505955170682494, "learning_rate": 2.375390243531532e-05, "loss": 0.1481, "step": 5794 }, { "epoch": 0.45117025935477595, "grad_norm": 1.1635466937899586, "learning_rate": 2.3748999230833744e-05, "loss": 0.1332, "step": 5795 }, { "epoch": 0.45124811444698554, "grad_norm": 1.1635678596305115, "learning_rate": 2.374409579280714e-05, "loss": 0.1374, "step": 5796 }, { "epoch": 0.4513259695391952, "grad_norm": 1.12860280412417, "learning_rate": 2.3739192121540967e-05, "loss": 0.1448, "step": 5797 }, { "epoch": 0.45140382463140477, "grad_norm": 1.1743675622295529, "learning_rate": 2.373428821734069e-05, "loss": 0.1663, "step": 5798 }, { "epoch": 0.4514816797236144, "grad_norm": 1.1729977641556486, "learning_rate": 2.3729384080511823e-05, "loss": 0.1492, "step": 5799 }, { "epoch": 0.45155953481582406, "grad_norm": 1.1855986417294793, "learning_rate": 2.3724479711359854e-05, "loss": 0.1431, "step": 5800 }, { "epoch": 0.45155953481582406, "eval_loss": 0.019237587228417397, "eval_runtime": 166.7591, "eval_samples_per_second": 17.27, "eval_steps_per_second": 0.618, "step": 5800 }, { "epoch": 0.45163738990803365, "grad_norm": 1.1738530372065359, "learning_rate": 2.3719575110190303e-05, "loss": 0.1399, "step": 5801 }, { "epoch": 0.4517152450002433, "grad_norm": 1.1765143736799564, "learning_rate": 2.371467027730871e-05, "loss": 0.1487, "step": 5802 }, { "epoch": 0.45179310009245294, "grad_norm": 1.165113618769785, "learning_rate": 2.370976521302061e-05, "loss": 0.1284, "step": 5803 }, { "epoch": 0.4518709551846625, "grad_norm": 1.3463703596304304, "learning_rate": 2.3704859917631583e-05, "loss": 0.1613, "step": 5804 }, { "epoch": 0.45194881027687217, "grad_norm": 1.2413172509133699, "learning_rate": 2.3699954391447197e-05, "loss": 0.1355, "step": 5805 }, { "epoch": 0.4520266653690818, "grad_norm": 1.2179764858350883, "learning_rate": 2.3695048634773047e-05, "loss": 0.1639, "step": 5806 }, { "epoch": 0.4521045204612914, "grad_norm": 1.225239410715354, "learning_rate": 2.369014264791473e-05, "loss": 0.2027, "step": 5807 }, { "epoch": 0.45218237555350105, "grad_norm": 1.1598043296686162, "learning_rate": 2.368523643117787e-05, "loss": 0.1408, "step": 5808 }, { "epoch": 0.4522602306457107, "grad_norm": 1.2288568684372185, "learning_rate": 2.36803299848681e-05, "loss": 0.143, "step": 5809 }, { "epoch": 0.4523380857379203, "grad_norm": 1.1637767851113296, "learning_rate": 2.3675423309291086e-05, "loss": 0.154, "step": 5810 }, { "epoch": 0.4524159408301299, "grad_norm": 1.1839239580324021, "learning_rate": 2.3670516404752465e-05, "loss": 0.1408, "step": 5811 }, { "epoch": 0.45249379592233957, "grad_norm": 1.3110726096516756, "learning_rate": 2.366560927155792e-05, "loss": 0.1437, "step": 5812 }, { "epoch": 0.45257165101454916, "grad_norm": 1.2027225186452974, "learning_rate": 2.3660701910013152e-05, "loss": 0.1615, "step": 5813 }, { "epoch": 0.4526495061067588, "grad_norm": 1.2947775538083104, "learning_rate": 2.3655794320423868e-05, "loss": 0.1614, "step": 5814 }, { "epoch": 0.45272736119896845, "grad_norm": 1.228836113783923, "learning_rate": 2.3650886503095777e-05, "loss": 0.1521, "step": 5815 }, { "epoch": 0.45280521629117804, "grad_norm": 1.239694705071836, "learning_rate": 2.364597845833461e-05, "loss": 0.1503, "step": 5816 }, { "epoch": 0.4528830713833877, "grad_norm": 1.221663405353854, "learning_rate": 2.3641070186446132e-05, "loss": 0.1344, "step": 5817 }, { "epoch": 0.45296092647559727, "grad_norm": 1.3483558552472323, "learning_rate": 2.363616168773609e-05, "loss": 0.1479, "step": 5818 }, { "epoch": 0.4530387815678069, "grad_norm": 1.1354859465370186, "learning_rate": 2.3631252962510278e-05, "loss": 0.1439, "step": 5819 }, { "epoch": 0.45311663666001656, "grad_norm": 1.1369403849914106, "learning_rate": 2.3626344011074466e-05, "loss": 0.1818, "step": 5820 }, { "epoch": 0.45319449175222615, "grad_norm": 1.218421936724782, "learning_rate": 2.3621434833734466e-05, "loss": 0.134, "step": 5821 }, { "epoch": 0.4532723468444358, "grad_norm": 1.2389152377159687, "learning_rate": 2.3616525430796097e-05, "loss": 0.1373, "step": 5822 }, { "epoch": 0.45335020193664544, "grad_norm": 1.3085737945092777, "learning_rate": 2.36116158025652e-05, "loss": 0.1699, "step": 5823 }, { "epoch": 0.453428057028855, "grad_norm": 1.28503226022632, "learning_rate": 2.360670594934761e-05, "loss": 0.1489, "step": 5824 }, { "epoch": 0.45350591212106467, "grad_norm": 1.2201087302525666, "learning_rate": 2.360179587144919e-05, "loss": 0.1738, "step": 5825 }, { "epoch": 0.4535837672132743, "grad_norm": 1.2363773438861683, "learning_rate": 2.3596885569175817e-05, "loss": 0.1872, "step": 5826 }, { "epoch": 0.4536616223054839, "grad_norm": 1.0550793819600328, "learning_rate": 2.3591975042833388e-05, "loss": 0.1327, "step": 5827 }, { "epoch": 0.45373947739769355, "grad_norm": 1.1879841336327928, "learning_rate": 2.358706429272779e-05, "loss": 0.141, "step": 5828 }, { "epoch": 0.4538173324899032, "grad_norm": 1.1553861977183713, "learning_rate": 2.3582153319164953e-05, "loss": 0.1312, "step": 5829 }, { "epoch": 0.4538951875821128, "grad_norm": 1.2504779588759092, "learning_rate": 2.35772421224508e-05, "loss": 0.184, "step": 5830 }, { "epoch": 0.4539730426743224, "grad_norm": 1.1522473924785548, "learning_rate": 2.357233070289127e-05, "loss": 0.1446, "step": 5831 }, { "epoch": 0.45405089776653207, "grad_norm": 1.1547043928622513, "learning_rate": 2.3567419060792337e-05, "loss": 0.1546, "step": 5832 }, { "epoch": 0.45412875285874166, "grad_norm": 1.1733872842486186, "learning_rate": 2.3562507196459964e-05, "loss": 0.1236, "step": 5833 }, { "epoch": 0.4542066079509513, "grad_norm": 1.092080362010714, "learning_rate": 2.3557595110200134e-05, "loss": 0.1312, "step": 5834 }, { "epoch": 0.4542844630431609, "grad_norm": 1.2417988740585544, "learning_rate": 2.3552682802318857e-05, "loss": 0.1624, "step": 5835 }, { "epoch": 0.45436231813537054, "grad_norm": 1.2741753080507217, "learning_rate": 2.354777027312213e-05, "loss": 0.1423, "step": 5836 }, { "epoch": 0.4544401732275802, "grad_norm": 1.278377618025668, "learning_rate": 2.3542857522915997e-05, "loss": 0.1673, "step": 5837 }, { "epoch": 0.45451802831978977, "grad_norm": 1.2747375438616857, "learning_rate": 2.35379445520065e-05, "loss": 0.1501, "step": 5838 }, { "epoch": 0.4545958834119994, "grad_norm": 1.1007096942653902, "learning_rate": 2.3533031360699678e-05, "loss": 0.1231, "step": 5839 }, { "epoch": 0.45467373850420906, "grad_norm": 1.2751938674599335, "learning_rate": 2.3528117949301614e-05, "loss": 0.1506, "step": 5840 }, { "epoch": 0.45475159359641865, "grad_norm": 1.220941674231852, "learning_rate": 2.352320431811838e-05, "loss": 0.1491, "step": 5841 }, { "epoch": 0.4548294486886283, "grad_norm": 1.1513996432106817, "learning_rate": 2.3518290467456085e-05, "loss": 0.1456, "step": 5842 }, { "epoch": 0.45490730378083793, "grad_norm": 1.1130835591377337, "learning_rate": 2.3513376397620832e-05, "loss": 0.124, "step": 5843 }, { "epoch": 0.4549851588730475, "grad_norm": 1.031238265105074, "learning_rate": 2.3508462108918733e-05, "loss": 0.1369, "step": 5844 }, { "epoch": 0.45506301396525717, "grad_norm": 1.2453261849521349, "learning_rate": 2.350354760165594e-05, "loss": 0.1364, "step": 5845 }, { "epoch": 0.4551408690574668, "grad_norm": 1.2922874662203294, "learning_rate": 2.3498632876138602e-05, "loss": 0.1633, "step": 5846 }, { "epoch": 0.4552187241496764, "grad_norm": 1.2292222815922962, "learning_rate": 2.3493717932672882e-05, "loss": 0.1713, "step": 5847 }, { "epoch": 0.45529657924188605, "grad_norm": 1.2279754516487118, "learning_rate": 2.3488802771564954e-05, "loss": 0.1356, "step": 5848 }, { "epoch": 0.4553744343340957, "grad_norm": 1.1880488060577046, "learning_rate": 2.3483887393121013e-05, "loss": 0.1479, "step": 5849 }, { "epoch": 0.4554522894263053, "grad_norm": 1.1101257253146557, "learning_rate": 2.3478971797647265e-05, "loss": 0.1264, "step": 5850 }, { "epoch": 0.4554522894263053, "eval_loss": 0.019073592498898506, "eval_runtime": 166.5351, "eval_samples_per_second": 17.294, "eval_steps_per_second": 0.618, "step": 5850 }, { "epoch": 0.4555301445185149, "grad_norm": 1.1813189562522939, "learning_rate": 2.3474055985449925e-05, "loss": 0.1468, "step": 5851 }, { "epoch": 0.4556079996107245, "grad_norm": 1.1964307155661207, "learning_rate": 2.3469139956835226e-05, "loss": 0.1704, "step": 5852 }, { "epoch": 0.45568585470293416, "grad_norm": 1.333254591579924, "learning_rate": 2.346422371210942e-05, "loss": 0.1781, "step": 5853 }, { "epoch": 0.4557637097951438, "grad_norm": 1.268178782902813, "learning_rate": 2.3459307251578743e-05, "loss": 0.1353, "step": 5854 }, { "epoch": 0.4558415648873534, "grad_norm": 1.151121878719906, "learning_rate": 2.3454390575549502e-05, "loss": 0.1362, "step": 5855 }, { "epoch": 0.45591941997956303, "grad_norm": 1.287512754462705, "learning_rate": 2.344947368432796e-05, "loss": 0.1609, "step": 5856 }, { "epoch": 0.4559972750717727, "grad_norm": 1.1025857443637865, "learning_rate": 2.344455657822042e-05, "loss": 0.1779, "step": 5857 }, { "epoch": 0.45607513016398227, "grad_norm": 1.1705465243979, "learning_rate": 2.3439639257533197e-05, "loss": 0.128, "step": 5858 }, { "epoch": 0.4561529852561919, "grad_norm": 1.2544951444867065, "learning_rate": 2.3434721722572615e-05, "loss": 0.1622, "step": 5859 }, { "epoch": 0.45623084034840156, "grad_norm": 1.3129617848510697, "learning_rate": 2.3429803973645012e-05, "loss": 0.1607, "step": 5860 }, { "epoch": 0.45630869544061115, "grad_norm": 1.1732997200763862, "learning_rate": 2.3424886011056746e-05, "loss": 0.1563, "step": 5861 }, { "epoch": 0.4563865505328208, "grad_norm": 1.2108293324866204, "learning_rate": 2.341996783511418e-05, "loss": 0.1741, "step": 5862 }, { "epoch": 0.45646440562503043, "grad_norm": 1.0848416803362697, "learning_rate": 2.3415049446123694e-05, "loss": 0.1446, "step": 5863 }, { "epoch": 0.45654226071724, "grad_norm": 1.2321765062206786, "learning_rate": 2.3410130844391675e-05, "loss": 0.1555, "step": 5864 }, { "epoch": 0.45662011580944967, "grad_norm": 1.279298637288307, "learning_rate": 2.340521203022454e-05, "loss": 0.2035, "step": 5865 }, { "epoch": 0.4566979709016593, "grad_norm": 1.1020425753300893, "learning_rate": 2.34002930039287e-05, "loss": 0.1505, "step": 5866 }, { "epoch": 0.4567758259938689, "grad_norm": 1.0834551699472232, "learning_rate": 2.3395373765810584e-05, "loss": 0.1223, "step": 5867 }, { "epoch": 0.45685368108607854, "grad_norm": 1.1138563894462912, "learning_rate": 2.339045431617665e-05, "loss": 0.126, "step": 5868 }, { "epoch": 0.45693153617828813, "grad_norm": 1.0770530781472853, "learning_rate": 2.3385534655333343e-05, "loss": 0.1281, "step": 5869 }, { "epoch": 0.4570093912704978, "grad_norm": 1.1596212030219235, "learning_rate": 2.338061478358714e-05, "loss": 0.1393, "step": 5870 }, { "epoch": 0.4570872463627074, "grad_norm": 1.14691652381503, "learning_rate": 2.3375694701244527e-05, "loss": 0.1403, "step": 5871 }, { "epoch": 0.457165101454917, "grad_norm": 1.2206332428733044, "learning_rate": 2.3370774408612e-05, "loss": 0.1554, "step": 5872 }, { "epoch": 0.45724295654712666, "grad_norm": 1.2504141359815697, "learning_rate": 2.3365853905996074e-05, "loss": 0.1628, "step": 5873 }, { "epoch": 0.4573208116393363, "grad_norm": 1.207371681503008, "learning_rate": 2.336093319370327e-05, "loss": 0.1677, "step": 5874 }, { "epoch": 0.4573986667315459, "grad_norm": 1.2720029478822261, "learning_rate": 2.3356012272040122e-05, "loss": 0.1548, "step": 5875 }, { "epoch": 0.45747652182375553, "grad_norm": 1.2022307334834106, "learning_rate": 2.335109114131319e-05, "loss": 0.1674, "step": 5876 }, { "epoch": 0.4575543769159652, "grad_norm": 1.16020970107446, "learning_rate": 2.334616980182903e-05, "loss": 0.1375, "step": 5877 }, { "epoch": 0.45763223200817477, "grad_norm": 1.1826969007705126, "learning_rate": 2.334124825389422e-05, "loss": 0.1546, "step": 5878 }, { "epoch": 0.4577100871003844, "grad_norm": 1.130533806677933, "learning_rate": 2.3336326497815348e-05, "loss": 0.1529, "step": 5879 }, { "epoch": 0.45778794219259406, "grad_norm": 1.2491164742975234, "learning_rate": 2.333140453389901e-05, "loss": 0.1412, "step": 5880 }, { "epoch": 0.45786579728480364, "grad_norm": 1.180897640912191, "learning_rate": 2.3326482362451844e-05, "loss": 0.1678, "step": 5881 }, { "epoch": 0.4579436523770133, "grad_norm": 1.101829209018883, "learning_rate": 2.3321559983780452e-05, "loss": 0.1434, "step": 5882 }, { "epoch": 0.45802150746922293, "grad_norm": 1.0220487517465346, "learning_rate": 2.3316637398191487e-05, "loss": 0.1007, "step": 5883 }, { "epoch": 0.4580993625614325, "grad_norm": 1.1471247130540885, "learning_rate": 2.3311714605991603e-05, "loss": 0.1559, "step": 5884 }, { "epoch": 0.45817721765364217, "grad_norm": 1.1713508226275653, "learning_rate": 2.330679160748746e-05, "loss": 0.1663, "step": 5885 }, { "epoch": 0.4582550727458518, "grad_norm": 1.4020277289084897, "learning_rate": 2.3301868402985757e-05, "loss": 0.1703, "step": 5886 }, { "epoch": 0.4583329278380614, "grad_norm": 1.1810518786736115, "learning_rate": 2.3296944992793163e-05, "loss": 0.1565, "step": 5887 }, { "epoch": 0.45841078293027104, "grad_norm": 1.1333839564349588, "learning_rate": 2.3292021377216393e-05, "loss": 0.108, "step": 5888 }, { "epoch": 0.45848863802248063, "grad_norm": 1.0576456679651296, "learning_rate": 2.3287097556562162e-05, "loss": 0.1343, "step": 5889 }, { "epoch": 0.4585664931146903, "grad_norm": 1.2443967633839388, "learning_rate": 2.3282173531137212e-05, "loss": 0.1586, "step": 5890 }, { "epoch": 0.4586443482068999, "grad_norm": 1.2036222419232803, "learning_rate": 2.3277249301248276e-05, "loss": 0.1352, "step": 5891 }, { "epoch": 0.4587222032991095, "grad_norm": 1.2634673604341762, "learning_rate": 2.327232486720211e-05, "loss": 0.1599, "step": 5892 }, { "epoch": 0.45880005839131915, "grad_norm": 1.066227142880493, "learning_rate": 2.3267400229305483e-05, "loss": 0.1356, "step": 5893 }, { "epoch": 0.4588779134835288, "grad_norm": 1.1392307861911533, "learning_rate": 2.3262475387865187e-05, "loss": 0.1334, "step": 5894 }, { "epoch": 0.4589557685757384, "grad_norm": 1.248275259150815, "learning_rate": 2.3257550343188005e-05, "loss": 0.1836, "step": 5895 }, { "epoch": 0.45903362366794803, "grad_norm": 1.1190201635126888, "learning_rate": 2.3252625095580753e-05, "loss": 0.1409, "step": 5896 }, { "epoch": 0.4591114787601577, "grad_norm": 1.1545113499933297, "learning_rate": 2.3247699645350244e-05, "loss": 0.1216, "step": 5897 }, { "epoch": 0.45918933385236727, "grad_norm": 1.057768275330854, "learning_rate": 2.3242773992803306e-05, "loss": 0.11, "step": 5898 }, { "epoch": 0.4592671889445769, "grad_norm": 1.2128385813808702, "learning_rate": 2.3237848138246796e-05, "loss": 0.1124, "step": 5899 }, { "epoch": 0.45934504403678655, "grad_norm": 1.1710636896496234, "learning_rate": 2.323292208198757e-05, "loss": 0.131, "step": 5900 }, { "epoch": 0.45934504403678655, "eval_loss": 0.01851610653102398, "eval_runtime": 166.8183, "eval_samples_per_second": 17.264, "eval_steps_per_second": 0.617, "step": 5900 }, { "epoch": 0.45942289912899614, "grad_norm": 1.1658037471840583, "learning_rate": 2.3227995824332493e-05, "loss": 0.1261, "step": 5901 }, { "epoch": 0.4595007542212058, "grad_norm": 1.1810682550653442, "learning_rate": 2.3223069365588446e-05, "loss": 0.1421, "step": 5902 }, { "epoch": 0.45957860931341543, "grad_norm": 1.1285865799579151, "learning_rate": 2.3218142706062327e-05, "loss": 0.1737, "step": 5903 }, { "epoch": 0.459656464405625, "grad_norm": 1.1525381925420743, "learning_rate": 2.3213215846061042e-05, "loss": 0.1375, "step": 5904 }, { "epoch": 0.45973431949783466, "grad_norm": 1.2231661499406963, "learning_rate": 2.320828878589152e-05, "loss": 0.1595, "step": 5905 }, { "epoch": 0.45981217459004425, "grad_norm": 1.2411950323135543, "learning_rate": 2.3203361525860685e-05, "loss": 0.1746, "step": 5906 }, { "epoch": 0.4598900296822539, "grad_norm": 1.1878137753989786, "learning_rate": 2.3198434066275484e-05, "loss": 0.1784, "step": 5907 }, { "epoch": 0.45996788477446354, "grad_norm": 1.2009890022667162, "learning_rate": 2.3193506407442872e-05, "loss": 0.1396, "step": 5908 }, { "epoch": 0.46004573986667313, "grad_norm": 1.150307751267974, "learning_rate": 2.3188578549669824e-05, "loss": 0.1546, "step": 5909 }, { "epoch": 0.4601235949588828, "grad_norm": 1.1779406808242912, "learning_rate": 2.318365049326332e-05, "loss": 0.1564, "step": 5910 }, { "epoch": 0.4602014500510924, "grad_norm": 1.1888009905497732, "learning_rate": 2.3178722238530352e-05, "loss": 0.1302, "step": 5911 }, { "epoch": 0.460279305143302, "grad_norm": 1.0667304691007058, "learning_rate": 2.3173793785777933e-05, "loss": 0.1236, "step": 5912 }, { "epoch": 0.46035716023551165, "grad_norm": 1.221465450762907, "learning_rate": 2.3168865135313078e-05, "loss": 0.1331, "step": 5913 }, { "epoch": 0.4604350153277213, "grad_norm": 1.2625083350951598, "learning_rate": 2.3163936287442818e-05, "loss": 0.1735, "step": 5914 }, { "epoch": 0.4605128704199309, "grad_norm": 1.1283822768590444, "learning_rate": 2.3159007242474205e-05, "loss": 0.1183, "step": 5915 }, { "epoch": 0.46059072551214053, "grad_norm": 1.1337075760965705, "learning_rate": 2.3154078000714285e-05, "loss": 0.1264, "step": 5916 }, { "epoch": 0.4606685806043502, "grad_norm": 1.2701560436401613, "learning_rate": 2.314914856247013e-05, "loss": 0.1371, "step": 5917 }, { "epoch": 0.46074643569655976, "grad_norm": 1.1542318318215556, "learning_rate": 2.314421892804882e-05, "loss": 0.1119, "step": 5918 }, { "epoch": 0.4608242907887694, "grad_norm": 1.2189413840395475, "learning_rate": 2.313928909775746e-05, "loss": 0.1449, "step": 5919 }, { "epoch": 0.46090214588097905, "grad_norm": 1.1785998944901446, "learning_rate": 2.3134359071903137e-05, "loss": 0.1673, "step": 5920 }, { "epoch": 0.46098000097318864, "grad_norm": 1.0676350534051682, "learning_rate": 2.3129428850792976e-05, "loss": 0.1216, "step": 5921 }, { "epoch": 0.4610578560653983, "grad_norm": 1.18804617774387, "learning_rate": 2.3124498434734107e-05, "loss": 0.1577, "step": 5922 }, { "epoch": 0.4611357111576079, "grad_norm": 1.2229841075591499, "learning_rate": 2.3119567824033676e-05, "loss": 0.1784, "step": 5923 }, { "epoch": 0.4612135662498175, "grad_norm": 1.1320731172208653, "learning_rate": 2.3114637018998832e-05, "loss": 0.1329, "step": 5924 }, { "epoch": 0.46129142134202716, "grad_norm": 1.1438837151374304, "learning_rate": 2.3109706019936743e-05, "loss": 0.1283, "step": 5925 }, { "epoch": 0.46136927643423675, "grad_norm": 1.1376924674872908, "learning_rate": 2.310477482715458e-05, "loss": 0.1583, "step": 5926 }, { "epoch": 0.4614471315264464, "grad_norm": 1.1011651161452176, "learning_rate": 2.309984344095954e-05, "loss": 0.1488, "step": 5927 }, { "epoch": 0.46152498661865604, "grad_norm": 1.0953534904874151, "learning_rate": 2.3094911861658833e-05, "loss": 0.1121, "step": 5928 }, { "epoch": 0.46160284171086563, "grad_norm": 1.1933865738255849, "learning_rate": 2.308998008955966e-05, "loss": 0.127, "step": 5929 }, { "epoch": 0.4616806968030753, "grad_norm": 1.1664314556878252, "learning_rate": 2.3085048124969247e-05, "loss": 0.1185, "step": 5930 }, { "epoch": 0.4617585518952849, "grad_norm": 1.1604825408009958, "learning_rate": 2.3080115968194838e-05, "loss": 0.1756, "step": 5931 }, { "epoch": 0.4618364069874945, "grad_norm": 1.1095286941372016, "learning_rate": 2.3075183619543685e-05, "loss": 0.1171, "step": 5932 }, { "epoch": 0.46191426207970415, "grad_norm": 1.1438675633052804, "learning_rate": 2.3070251079323044e-05, "loss": 0.1255, "step": 5933 }, { "epoch": 0.4619921171719138, "grad_norm": 1.2670795252721263, "learning_rate": 2.3065318347840195e-05, "loss": 0.1529, "step": 5934 }, { "epoch": 0.4620699722641234, "grad_norm": 1.2577793014045526, "learning_rate": 2.3060385425402422e-05, "loss": 0.1567, "step": 5935 }, { "epoch": 0.46214782735633303, "grad_norm": 1.1541599143487313, "learning_rate": 2.3055452312317018e-05, "loss": 0.1504, "step": 5936 }, { "epoch": 0.4622256824485427, "grad_norm": 1.1771679846005867, "learning_rate": 2.30505190088913e-05, "loss": 0.1364, "step": 5937 }, { "epoch": 0.46230353754075226, "grad_norm": 1.0401721147518113, "learning_rate": 2.3045585515432583e-05, "loss": 0.1433, "step": 5938 }, { "epoch": 0.4623813926329619, "grad_norm": 1.1940488671027014, "learning_rate": 2.3040651832248208e-05, "loss": 0.1739, "step": 5939 }, { "epoch": 0.46245924772517155, "grad_norm": 1.1828488335757243, "learning_rate": 2.303571795964552e-05, "loss": 0.1481, "step": 5940 }, { "epoch": 0.46253710281738114, "grad_norm": 1.1701213656473322, "learning_rate": 2.3030783897931862e-05, "loss": 0.1202, "step": 5941 }, { "epoch": 0.4626149579095908, "grad_norm": 1.1843044614716702, "learning_rate": 2.302584964741462e-05, "loss": 0.1286, "step": 5942 }, { "epoch": 0.4626928130018004, "grad_norm": 1.188339793810509, "learning_rate": 2.302091520840117e-05, "loss": 0.1503, "step": 5943 }, { "epoch": 0.46277066809401, "grad_norm": 1.0794476093986316, "learning_rate": 2.3015980581198894e-05, "loss": 0.1323, "step": 5944 }, { "epoch": 0.46284852318621966, "grad_norm": 1.112422587322072, "learning_rate": 2.301104576611521e-05, "loss": 0.1295, "step": 5945 }, { "epoch": 0.46292637827842925, "grad_norm": 1.2482236988105295, "learning_rate": 2.3006110763457525e-05, "loss": 0.1516, "step": 5946 }, { "epoch": 0.4630042333706389, "grad_norm": 1.184476273355181, "learning_rate": 2.3001175573533267e-05, "loss": 0.134, "step": 5947 }, { "epoch": 0.46308208846284854, "grad_norm": 1.09591811216186, "learning_rate": 2.2996240196649888e-05, "loss": 0.1405, "step": 5948 }, { "epoch": 0.46315994355505813, "grad_norm": 1.1979678723836071, "learning_rate": 2.2991304633114816e-05, "loss": 0.1433, "step": 5949 }, { "epoch": 0.4632377986472678, "grad_norm": 1.1997684972092055, "learning_rate": 2.298636888323553e-05, "loss": 0.1445, "step": 5950 }, { "epoch": 0.4632377986472678, "eval_loss": 0.018290504813194275, "eval_runtime": 166.6813, "eval_samples_per_second": 17.278, "eval_steps_per_second": 0.618, "step": 5950 }, { "epoch": 0.4633156537394774, "grad_norm": 1.1368557449059808, "learning_rate": 2.29814329473195e-05, "loss": 0.1429, "step": 5951 }, { "epoch": 0.463393508831687, "grad_norm": 1.2319793296401038, "learning_rate": 2.2976496825674217e-05, "loss": 0.1463, "step": 5952 }, { "epoch": 0.46347136392389665, "grad_norm": 1.1780807875597543, "learning_rate": 2.297156051860717e-05, "loss": 0.1252, "step": 5953 }, { "epoch": 0.4635492190161063, "grad_norm": 1.1768101999699065, "learning_rate": 2.296662402642586e-05, "loss": 0.1611, "step": 5954 }, { "epoch": 0.4636270741083159, "grad_norm": 1.1697552131562945, "learning_rate": 2.296168734943783e-05, "loss": 0.152, "step": 5955 }, { "epoch": 0.46370492920052553, "grad_norm": 1.174674923187968, "learning_rate": 2.2956750487950595e-05, "loss": 0.1378, "step": 5956 }, { "epoch": 0.4637827842927352, "grad_norm": 1.1591459990634116, "learning_rate": 2.2951813442271706e-05, "loss": 0.1486, "step": 5957 }, { "epoch": 0.46386063938494476, "grad_norm": 1.1623098210354994, "learning_rate": 2.2946876212708706e-05, "loss": 0.1754, "step": 5958 }, { "epoch": 0.4639384944771544, "grad_norm": 1.1606827599144411, "learning_rate": 2.2941938799569173e-05, "loss": 0.1447, "step": 5959 }, { "epoch": 0.464016349569364, "grad_norm": 1.2100888872283535, "learning_rate": 2.2937001203160683e-05, "loss": 0.1556, "step": 5960 }, { "epoch": 0.46409420466157364, "grad_norm": 1.1210853347068768, "learning_rate": 2.2932063423790823e-05, "loss": 0.1465, "step": 5961 }, { "epoch": 0.4641720597537833, "grad_norm": 1.072481985537946, "learning_rate": 2.2927125461767195e-05, "loss": 0.1579, "step": 5962 }, { "epoch": 0.4642499148459929, "grad_norm": 1.119953796359417, "learning_rate": 2.2922187317397408e-05, "loss": 0.139, "step": 5963 }, { "epoch": 0.4643277699382025, "grad_norm": 1.2296820733839497, "learning_rate": 2.2917248990989086e-05, "loss": 0.1249, "step": 5964 }, { "epoch": 0.46440562503041216, "grad_norm": 1.2093187061136883, "learning_rate": 2.291231048284986e-05, "loss": 0.1467, "step": 5965 }, { "epoch": 0.46448348012262175, "grad_norm": 1.1899364189684696, "learning_rate": 2.2907371793287383e-05, "loss": 0.165, "step": 5966 }, { "epoch": 0.4645613352148314, "grad_norm": 1.2497773156252316, "learning_rate": 2.2902432922609313e-05, "loss": 0.1541, "step": 5967 }, { "epoch": 0.46463919030704104, "grad_norm": 1.2952519523347303, "learning_rate": 2.2897493871123313e-05, "loss": 0.1601, "step": 5968 }, { "epoch": 0.46471704539925063, "grad_norm": 1.334143164485398, "learning_rate": 2.2892554639137065e-05, "loss": 0.1684, "step": 5969 }, { "epoch": 0.46479490049146027, "grad_norm": 1.2426192114477472, "learning_rate": 2.2887615226958252e-05, "loss": 0.1839, "step": 5970 }, { "epoch": 0.4648727555836699, "grad_norm": 1.1892013557685548, "learning_rate": 2.288267563489459e-05, "loss": 0.1523, "step": 5971 }, { "epoch": 0.4649506106758795, "grad_norm": 1.2220147579924907, "learning_rate": 2.2877735863253783e-05, "loss": 0.1448, "step": 5972 }, { "epoch": 0.46502846576808915, "grad_norm": 1.0376337360584909, "learning_rate": 2.287279591234356e-05, "loss": 0.1158, "step": 5973 }, { "epoch": 0.4651063208602988, "grad_norm": 1.1079054764591245, "learning_rate": 2.286785578247166e-05, "loss": 0.1469, "step": 5974 }, { "epoch": 0.4651841759525084, "grad_norm": 1.1147048244522872, "learning_rate": 2.2862915473945812e-05, "loss": 0.1422, "step": 5975 }, { "epoch": 0.465262031044718, "grad_norm": 1.110707375968459, "learning_rate": 2.2857974987073798e-05, "loss": 0.1327, "step": 5976 }, { "epoch": 0.4653398861369276, "grad_norm": 1.1506846977390408, "learning_rate": 2.285303432216337e-05, "loss": 0.1663, "step": 5977 }, { "epoch": 0.46541774122913726, "grad_norm": 1.1674612031813265, "learning_rate": 2.284809347952232e-05, "loss": 0.144, "step": 5978 }, { "epoch": 0.4654955963213469, "grad_norm": 1.1611747061813718, "learning_rate": 2.284315245945843e-05, "loss": 0.1014, "step": 5979 }, { "epoch": 0.4655734514135565, "grad_norm": 1.106922897248994, "learning_rate": 2.28382112622795e-05, "loss": 0.1408, "step": 5980 }, { "epoch": 0.46565130650576614, "grad_norm": 1.1092547200533076, "learning_rate": 2.283326988829336e-05, "loss": 0.1299, "step": 5981 }, { "epoch": 0.4657291615979758, "grad_norm": 1.1053282641396605, "learning_rate": 2.2828328337807815e-05, "loss": 0.1464, "step": 5982 }, { "epoch": 0.46580701669018537, "grad_norm": 1.2887023351096383, "learning_rate": 2.2823386611130715e-05, "loss": 0.138, "step": 5983 }, { "epoch": 0.465884871782395, "grad_norm": 1.1620572658750565, "learning_rate": 2.2818444708569896e-05, "loss": 0.1892, "step": 5984 }, { "epoch": 0.46596272687460466, "grad_norm": 1.1516641440583424, "learning_rate": 2.281350263043322e-05, "loss": 0.1652, "step": 5985 }, { "epoch": 0.46604058196681425, "grad_norm": 1.2205967688243233, "learning_rate": 2.2808560377028562e-05, "loss": 0.1164, "step": 5986 }, { "epoch": 0.4661184370590239, "grad_norm": 1.2347025702041723, "learning_rate": 2.280361794866379e-05, "loss": 0.1451, "step": 5987 }, { "epoch": 0.46619629215123354, "grad_norm": 1.2200006694209615, "learning_rate": 2.2798675345646787e-05, "loss": 0.1641, "step": 5988 }, { "epoch": 0.4662741472434431, "grad_norm": 1.1408359476063488, "learning_rate": 2.2793732568285476e-05, "loss": 0.119, "step": 5989 }, { "epoch": 0.46635200233565277, "grad_norm": 1.1594088577862927, "learning_rate": 2.2788789616887757e-05, "loss": 0.1278, "step": 5990 }, { "epoch": 0.4664298574278624, "grad_norm": 1.0952733529464391, "learning_rate": 2.2783846491761552e-05, "loss": 0.1234, "step": 5991 }, { "epoch": 0.466507712520072, "grad_norm": 1.278162358658716, "learning_rate": 2.2778903193214793e-05, "loss": 0.1828, "step": 5992 }, { "epoch": 0.46658556761228165, "grad_norm": 1.1613660977936116, "learning_rate": 2.2773959721555428e-05, "loss": 0.1185, "step": 5993 }, { "epoch": 0.4666634227044913, "grad_norm": 1.1417669126597039, "learning_rate": 2.2769016077091415e-05, "loss": 0.1367, "step": 5994 }, { "epoch": 0.4667412777967009, "grad_norm": 1.1371485287850553, "learning_rate": 2.2764072260130714e-05, "loss": 0.1425, "step": 5995 }, { "epoch": 0.4668191328889105, "grad_norm": 1.3161181407966314, "learning_rate": 2.27591282709813e-05, "loss": 0.1589, "step": 5996 }, { "epoch": 0.4668969879811201, "grad_norm": 1.1469057959391993, "learning_rate": 2.275418410995117e-05, "loss": 0.1601, "step": 5997 }, { "epoch": 0.46697484307332976, "grad_norm": 1.1541205326970947, "learning_rate": 2.2749239777348308e-05, "loss": 0.1606, "step": 5998 }, { "epoch": 0.4670526981655394, "grad_norm": 1.1949073005661923, "learning_rate": 2.2744295273480733e-05, "loss": 0.1368, "step": 5999 }, { "epoch": 0.467130553257749, "grad_norm": 1.0661939685455564, "learning_rate": 2.2739350598656468e-05, "loss": 0.1323, "step": 6000 }, { "epoch": 0.467130553257749, "eval_loss": 0.01800386793911457, "eval_runtime": 166.6749, "eval_samples_per_second": 17.279, "eval_steps_per_second": 0.618, "step": 6000 }, { "epoch": 0.46720840834995864, "grad_norm": 1.0849060175228786, "learning_rate": 2.273440575318353e-05, "loss": 0.1239, "step": 6001 }, { "epoch": 0.4672862634421683, "grad_norm": 1.1175466408757408, "learning_rate": 2.2729460737369974e-05, "loss": 0.1306, "step": 6002 }, { "epoch": 0.46736411853437787, "grad_norm": 1.1121467240156995, "learning_rate": 2.272451555152383e-05, "loss": 0.1625, "step": 6003 }, { "epoch": 0.4674419736265875, "grad_norm": 1.0845377553585442, "learning_rate": 2.2719570195953184e-05, "loss": 0.1303, "step": 6004 }, { "epoch": 0.46751982871879716, "grad_norm": 1.078142345940061, "learning_rate": 2.27146246709661e-05, "loss": 0.1227, "step": 6005 }, { "epoch": 0.46759768381100675, "grad_norm": 1.180211486221926, "learning_rate": 2.2709678976870654e-05, "loss": 0.1306, "step": 6006 }, { "epoch": 0.4676755389032164, "grad_norm": 1.212246187148338, "learning_rate": 2.2704733113974947e-05, "loss": 0.1696, "step": 6007 }, { "epoch": 0.46775339399542604, "grad_norm": 1.0817487708425786, "learning_rate": 2.2699787082587075e-05, "loss": 0.1526, "step": 6008 }, { "epoch": 0.4678312490876356, "grad_norm": 1.1932227135898221, "learning_rate": 2.2694840883015162e-05, "loss": 0.1404, "step": 6009 }, { "epoch": 0.46790910417984527, "grad_norm": 1.1792217114759793, "learning_rate": 2.2689894515567333e-05, "loss": 0.1238, "step": 6010 }, { "epoch": 0.4679869592720549, "grad_norm": 1.1791309970471182, "learning_rate": 2.2684947980551712e-05, "loss": 0.1154, "step": 6011 }, { "epoch": 0.4680648143642645, "grad_norm": 1.0960906537622463, "learning_rate": 2.2680001278276455e-05, "loss": 0.1189, "step": 6012 }, { "epoch": 0.46814266945647415, "grad_norm": 1.230763560635609, "learning_rate": 2.2675054409049713e-05, "loss": 0.1435, "step": 6013 }, { "epoch": 0.46822052454868374, "grad_norm": 0.9605587298257636, "learning_rate": 2.2670107373179654e-05, "loss": 0.1008, "step": 6014 }, { "epoch": 0.4682983796408934, "grad_norm": 1.3032241171205878, "learning_rate": 2.2665160170974466e-05, "loss": 0.1655, "step": 6015 }, { "epoch": 0.468376234733103, "grad_norm": 1.1673429071841337, "learning_rate": 2.266021280274232e-05, "loss": 0.1477, "step": 6016 }, { "epoch": 0.4684540898253126, "grad_norm": 1.118882230208271, "learning_rate": 2.265526526879142e-05, "loss": 0.1167, "step": 6017 }, { "epoch": 0.46853194491752226, "grad_norm": 1.2635275129942674, "learning_rate": 2.265031756942997e-05, "loss": 0.1833, "step": 6018 }, { "epoch": 0.4686098000097319, "grad_norm": 1.076828581051791, "learning_rate": 2.2645369704966207e-05, "loss": 0.1289, "step": 6019 }, { "epoch": 0.4686876551019415, "grad_norm": 1.1433806150508952, "learning_rate": 2.2640421675708338e-05, "loss": 0.1505, "step": 6020 }, { "epoch": 0.46876551019415114, "grad_norm": 1.162430566851474, "learning_rate": 2.2635473481964603e-05, "loss": 0.1332, "step": 6021 }, { "epoch": 0.4688433652863608, "grad_norm": 1.0919216590870269, "learning_rate": 2.2630525124043264e-05, "loss": 0.1449, "step": 6022 }, { "epoch": 0.46892122037857037, "grad_norm": 1.1669673044105175, "learning_rate": 2.2625576602252575e-05, "loss": 0.1582, "step": 6023 }, { "epoch": 0.46899907547078, "grad_norm": 1.186206152744438, "learning_rate": 2.2620627916900804e-05, "loss": 0.1377, "step": 6024 }, { "epoch": 0.46907693056298966, "grad_norm": 1.108284072822363, "learning_rate": 2.2615679068296236e-05, "loss": 0.1423, "step": 6025 }, { "epoch": 0.46915478565519925, "grad_norm": 1.1524163319003704, "learning_rate": 2.2610730056747147e-05, "loss": 0.1315, "step": 6026 }, { "epoch": 0.4692326407474089, "grad_norm": 1.1368879375883865, "learning_rate": 2.2605780882561854e-05, "loss": 0.1455, "step": 6027 }, { "epoch": 0.46931049583961854, "grad_norm": 1.1739938972785422, "learning_rate": 2.260083154604866e-05, "loss": 0.1225, "step": 6028 }, { "epoch": 0.4693883509318281, "grad_norm": 1.1620157613342097, "learning_rate": 2.2595882047515888e-05, "loss": 0.1408, "step": 6029 }, { "epoch": 0.46946620602403777, "grad_norm": 1.1872578446274844, "learning_rate": 2.259093238727186e-05, "loss": 0.1487, "step": 6030 }, { "epoch": 0.46954406111624736, "grad_norm": 1.181258479270362, "learning_rate": 2.2585982565624924e-05, "loss": 0.1642, "step": 6031 }, { "epoch": 0.469621916208457, "grad_norm": 1.1576626713391425, "learning_rate": 2.258103258288343e-05, "loss": 0.1217, "step": 6032 }, { "epoch": 0.46969977130066665, "grad_norm": 1.0505062864919872, "learning_rate": 2.257608243935574e-05, "loss": 0.1281, "step": 6033 }, { "epoch": 0.46977762639287624, "grad_norm": 1.1045383839161511, "learning_rate": 2.2571132135350223e-05, "loss": 0.1415, "step": 6034 }, { "epoch": 0.4698554814850859, "grad_norm": 1.1602812274926089, "learning_rate": 2.256618167117526e-05, "loss": 0.1236, "step": 6035 }, { "epoch": 0.4699333365772955, "grad_norm": 1.1935393647814134, "learning_rate": 2.256123104713923e-05, "loss": 0.1408, "step": 6036 }, { "epoch": 0.4700111916695051, "grad_norm": 1.224669039843139, "learning_rate": 2.255628026355056e-05, "loss": 0.1514, "step": 6037 }, { "epoch": 0.47008904676171476, "grad_norm": 1.267942373961983, "learning_rate": 2.2551329320717636e-05, "loss": 0.1738, "step": 6038 }, { "epoch": 0.4701669018539244, "grad_norm": 1.2544236276453997, "learning_rate": 2.254637821894889e-05, "loss": 0.1498, "step": 6039 }, { "epoch": 0.470244756946134, "grad_norm": 1.1172376140904328, "learning_rate": 2.254142695855275e-05, "loss": 0.1386, "step": 6040 }, { "epoch": 0.47032261203834363, "grad_norm": 1.157449919842702, "learning_rate": 2.253647553983766e-05, "loss": 0.1191, "step": 6041 }, { "epoch": 0.4704004671305533, "grad_norm": 1.1048002954567375, "learning_rate": 2.253152396311206e-05, "loss": 0.1377, "step": 6042 }, { "epoch": 0.47047832222276287, "grad_norm": 1.1494057733784377, "learning_rate": 2.2526572228684424e-05, "loss": 0.1589, "step": 6043 }, { "epoch": 0.4705561773149725, "grad_norm": 1.1231849678470618, "learning_rate": 2.2521620336863207e-05, "loss": 0.1458, "step": 6044 }, { "epoch": 0.47063403240718216, "grad_norm": 1.2260172304897963, "learning_rate": 2.2516668287956897e-05, "loss": 0.148, "step": 6045 }, { "epoch": 0.47071188749939175, "grad_norm": 1.2039979017737554, "learning_rate": 2.2511716082273988e-05, "loss": 0.1168, "step": 6046 }, { "epoch": 0.4707897425916014, "grad_norm": 1.1537239937155286, "learning_rate": 2.250676372012296e-05, "loss": 0.1388, "step": 6047 }, { "epoch": 0.47086759768381103, "grad_norm": 1.161214859270076, "learning_rate": 2.2501811201812348e-05, "loss": 0.1236, "step": 6048 }, { "epoch": 0.4709454527760206, "grad_norm": 1.160923507702245, "learning_rate": 2.2496858527650653e-05, "loss": 0.1466, "step": 6049 }, { "epoch": 0.47102330786823027, "grad_norm": 1.1827972706878482, "learning_rate": 2.2491905697946405e-05, "loss": 0.1365, "step": 6050 }, { "epoch": 0.47102330786823027, "eval_loss": 0.017664916813373566, "eval_runtime": 167.1158, "eval_samples_per_second": 17.234, "eval_steps_per_second": 0.616, "step": 6050 }, { "epoch": 0.47110116296043986, "grad_norm": 1.0117248154781344, "learning_rate": 2.248695271300815e-05, "loss": 0.1152, "step": 6051 }, { "epoch": 0.4711790180526495, "grad_norm": 2.9854695477847253, "learning_rate": 2.2481999573144427e-05, "loss": 0.1411, "step": 6052 }, { "epoch": 0.47125687314485915, "grad_norm": 1.256633703281455, "learning_rate": 2.24770462786638e-05, "loss": 0.1322, "step": 6053 }, { "epoch": 0.47133472823706873, "grad_norm": 1.14958108269071, "learning_rate": 2.247209282987483e-05, "loss": 0.1089, "step": 6054 }, { "epoch": 0.4714125833292784, "grad_norm": 1.1817844012689844, "learning_rate": 2.2467139227086098e-05, "loss": 0.1581, "step": 6055 }, { "epoch": 0.471490438421488, "grad_norm": 1.1708849141691364, "learning_rate": 2.2462185470606192e-05, "loss": 0.1472, "step": 6056 }, { "epoch": 0.4715682935136976, "grad_norm": 1.0699021277494802, "learning_rate": 2.2457231560743703e-05, "loss": 0.1261, "step": 6057 }, { "epoch": 0.47164614860590726, "grad_norm": 1.0304156862514846, "learning_rate": 2.245227749780724e-05, "loss": 0.1001, "step": 6058 }, { "epoch": 0.4717240036981169, "grad_norm": 1.1150162776493968, "learning_rate": 2.2447323282105408e-05, "loss": 0.1306, "step": 6059 }, { "epoch": 0.4718018587903265, "grad_norm": 1.1170649002339363, "learning_rate": 2.244236891394684e-05, "loss": 0.1558, "step": 6060 }, { "epoch": 0.47187971388253613, "grad_norm": 1.067596958530957, "learning_rate": 2.2437414393640173e-05, "loss": 0.1158, "step": 6061 }, { "epoch": 0.4719575689747458, "grad_norm": 1.09481806722322, "learning_rate": 2.2432459721494045e-05, "loss": 0.1234, "step": 6062 }, { "epoch": 0.47203542406695537, "grad_norm": 1.189817720620151, "learning_rate": 2.242750489781711e-05, "loss": 0.14, "step": 6063 }, { "epoch": 0.472113279159165, "grad_norm": 1.1320452037301223, "learning_rate": 2.242254992291803e-05, "loss": 0.1239, "step": 6064 }, { "epoch": 0.47219113425137466, "grad_norm": 1.1525236091875322, "learning_rate": 2.2417594797105475e-05, "loss": 0.1195, "step": 6065 }, { "epoch": 0.47226898934358424, "grad_norm": 1.1009640614101466, "learning_rate": 2.241263952068813e-05, "loss": 0.1639, "step": 6066 }, { "epoch": 0.4723468444357939, "grad_norm": 1.356224854624328, "learning_rate": 2.2407684093974682e-05, "loss": 0.1594, "step": 6067 }, { "epoch": 0.4724246995280035, "grad_norm": 1.2012228101743962, "learning_rate": 2.2402728517273834e-05, "loss": 0.1449, "step": 6068 }, { "epoch": 0.4725025546202131, "grad_norm": 1.1200829438567341, "learning_rate": 2.239777279089429e-05, "loss": 0.1282, "step": 6069 }, { "epoch": 0.47258040971242277, "grad_norm": 1.0675452055701233, "learning_rate": 2.2392816915144775e-05, "loss": 0.1386, "step": 6070 }, { "epoch": 0.47265826480463236, "grad_norm": 1.1133895764886925, "learning_rate": 2.238786089033401e-05, "loss": 0.1478, "step": 6071 }, { "epoch": 0.472736119896842, "grad_norm": 1.069439114336819, "learning_rate": 2.238290471677074e-05, "loss": 0.1232, "step": 6072 }, { "epoch": 0.47281397498905164, "grad_norm": 1.0776642176224085, "learning_rate": 2.2377948394763706e-05, "loss": 0.1207, "step": 6073 }, { "epoch": 0.47289183008126123, "grad_norm": 1.2612880106581341, "learning_rate": 2.237299192462166e-05, "loss": 0.1819, "step": 6074 }, { "epoch": 0.4729696851734709, "grad_norm": 1.097051385936589, "learning_rate": 2.2368035306653373e-05, "loss": 0.1059, "step": 6075 }, { "epoch": 0.4730475402656805, "grad_norm": 1.1494334927809435, "learning_rate": 2.2363078541167614e-05, "loss": 0.1118, "step": 6076 }, { "epoch": 0.4731253953578901, "grad_norm": 1.0853816748065592, "learning_rate": 2.2358121628473178e-05, "loss": 0.1156, "step": 6077 }, { "epoch": 0.47320325045009975, "grad_norm": 1.1144300875403335, "learning_rate": 2.2353164568878843e-05, "loss": 0.1571, "step": 6078 }, { "epoch": 0.4732811055423094, "grad_norm": 1.1566482244276806, "learning_rate": 2.2348207362693423e-05, "loss": 0.1444, "step": 6079 }, { "epoch": 0.473358960634519, "grad_norm": 1.0589589088933962, "learning_rate": 2.2343250010225707e-05, "loss": 0.1395, "step": 6080 }, { "epoch": 0.47343681572672863, "grad_norm": 1.194289680036988, "learning_rate": 2.2338292511784548e-05, "loss": 0.1568, "step": 6081 }, { "epoch": 0.4735146708189383, "grad_norm": 1.1385480527327658, "learning_rate": 2.2333334867678747e-05, "loss": 0.1662, "step": 6082 }, { "epoch": 0.47359252591114787, "grad_norm": 1.2472348309316172, "learning_rate": 2.232837707821715e-05, "loss": 0.1593, "step": 6083 }, { "epoch": 0.4736703810033575, "grad_norm": 1.1140206870302682, "learning_rate": 2.2323419143708605e-05, "loss": 0.1485, "step": 6084 }, { "epoch": 0.4737482360955671, "grad_norm": 1.095668564214232, "learning_rate": 2.2318461064461965e-05, "loss": 0.1825, "step": 6085 }, { "epoch": 0.47382609118777674, "grad_norm": 1.1049743266570904, "learning_rate": 2.231350284078611e-05, "loss": 0.1481, "step": 6086 }, { "epoch": 0.4739039462799864, "grad_norm": 1.081589174970284, "learning_rate": 2.2308544472989895e-05, "loss": 0.117, "step": 6087 }, { "epoch": 0.473981801372196, "grad_norm": 1.1395292199356468, "learning_rate": 2.230358596138221e-05, "loss": 0.1255, "step": 6088 }, { "epoch": 0.4740596564644056, "grad_norm": 1.3228993304742227, "learning_rate": 2.2298627306271953e-05, "loss": 0.1988, "step": 6089 }, { "epoch": 0.47413751155661527, "grad_norm": 1.1739400711784898, "learning_rate": 2.2293668507968015e-05, "loss": 0.1646, "step": 6090 }, { "epoch": 0.47421536664882485, "grad_norm": 1.0606898586149114, "learning_rate": 2.228870956677931e-05, "loss": 0.1306, "step": 6091 }, { "epoch": 0.4742932217410345, "grad_norm": 1.107049403810959, "learning_rate": 2.2283750483014756e-05, "loss": 0.1168, "step": 6092 }, { "epoch": 0.47437107683324414, "grad_norm": 1.0641731166339439, "learning_rate": 2.227879125698328e-05, "loss": 0.1357, "step": 6093 }, { "epoch": 0.47444893192545373, "grad_norm": 1.2705384117241054, "learning_rate": 2.2273831888993823e-05, "loss": 0.1795, "step": 6094 }, { "epoch": 0.4745267870176634, "grad_norm": 1.2669591029987286, "learning_rate": 2.226887237935533e-05, "loss": 0.1561, "step": 6095 }, { "epoch": 0.474604642109873, "grad_norm": 1.1123696608985671, "learning_rate": 2.226391272837675e-05, "loss": 0.1381, "step": 6096 }, { "epoch": 0.4746824972020826, "grad_norm": 1.049501809096318, "learning_rate": 2.225895293636705e-05, "loss": 0.1131, "step": 6097 }, { "epoch": 0.47476035229429225, "grad_norm": 1.1583656146078534, "learning_rate": 2.225399300363519e-05, "loss": 0.1837, "step": 6098 }, { "epoch": 0.4748382073865019, "grad_norm": 0.9944340929236719, "learning_rate": 2.224903293049017e-05, "loss": 0.1079, "step": 6099 }, { "epoch": 0.4749160624787115, "grad_norm": 1.0352145582400722, "learning_rate": 2.224407271724097e-05, "loss": 0.1433, "step": 6100 }, { "epoch": 0.4749160624787115, "eval_loss": 0.017421018332242966, "eval_runtime": 166.7023, "eval_samples_per_second": 17.276, "eval_steps_per_second": 0.618, "step": 6100 }, { "epoch": 0.47499391757092113, "grad_norm": 1.1570286058529604, "learning_rate": 2.2239112364196586e-05, "loss": 0.1562, "step": 6101 }, { "epoch": 0.4750717726631307, "grad_norm": 1.135630444074913, "learning_rate": 2.2234151871666025e-05, "loss": 0.127, "step": 6102 }, { "epoch": 0.47514962775534036, "grad_norm": 1.164117155443695, "learning_rate": 2.2229191239958304e-05, "loss": 0.1816, "step": 6103 }, { "epoch": 0.47522748284755, "grad_norm": 1.0312915411882124, "learning_rate": 2.222423046938245e-05, "loss": 0.1307, "step": 6104 }, { "epoch": 0.4753053379397596, "grad_norm": 1.1451738242490777, "learning_rate": 2.221926956024749e-05, "loss": 0.1409, "step": 6105 }, { "epoch": 0.47538319303196924, "grad_norm": 1.09879303336428, "learning_rate": 2.221430851286247e-05, "loss": 0.1214, "step": 6106 }, { "epoch": 0.4754610481241789, "grad_norm": 1.1424083060094576, "learning_rate": 2.220934732753644e-05, "loss": 0.1797, "step": 6107 }, { "epoch": 0.4755389032163885, "grad_norm": 1.1059227659248265, "learning_rate": 2.2204386004578446e-05, "loss": 0.1286, "step": 6108 }, { "epoch": 0.4756167583085981, "grad_norm": 1.0846847691055532, "learning_rate": 2.2199424544297573e-05, "loss": 0.1174, "step": 6109 }, { "epoch": 0.47569461340080776, "grad_norm": 1.1663150508677094, "learning_rate": 2.219446294700289e-05, "loss": 0.1425, "step": 6110 }, { "epoch": 0.47577246849301735, "grad_norm": 1.1092866017032734, "learning_rate": 2.218950121300348e-05, "loss": 0.1224, "step": 6111 }, { "epoch": 0.475850323585227, "grad_norm": 1.1339225937885238, "learning_rate": 2.2184539342608433e-05, "loss": 0.1793, "step": 6112 }, { "epoch": 0.47592817867743664, "grad_norm": 1.1135172370850515, "learning_rate": 2.2179577336126852e-05, "loss": 0.1666, "step": 6113 }, { "epoch": 0.47600603376964623, "grad_norm": 1.0810148414961642, "learning_rate": 2.2174615193867855e-05, "loss": 0.1196, "step": 6114 }, { "epoch": 0.4760838888618559, "grad_norm": 1.0805883021083238, "learning_rate": 2.2169652916140554e-05, "loss": 0.1414, "step": 6115 }, { "epoch": 0.4761617439540655, "grad_norm": 1.1130940997660979, "learning_rate": 2.2164690503254062e-05, "loss": 0.1442, "step": 6116 }, { "epoch": 0.4762395990462751, "grad_norm": 1.1300868212676194, "learning_rate": 2.2159727955517537e-05, "loss": 0.1252, "step": 6117 }, { "epoch": 0.47631745413848475, "grad_norm": 1.0408068747559773, "learning_rate": 2.215476527324011e-05, "loss": 0.1203, "step": 6118 }, { "epoch": 0.4763953092306944, "grad_norm": 1.212727136191162, "learning_rate": 2.2149802456730934e-05, "loss": 0.1523, "step": 6119 }, { "epoch": 0.476473164322904, "grad_norm": 1.1053145368271404, "learning_rate": 2.214483950629917e-05, "loss": 0.1571, "step": 6120 }, { "epoch": 0.47655101941511363, "grad_norm": 1.1195909583108943, "learning_rate": 2.2139876422253983e-05, "loss": 0.1371, "step": 6121 }, { "epoch": 0.4766288745073232, "grad_norm": 1.128078313025199, "learning_rate": 2.2134913204904557e-05, "loss": 0.1605, "step": 6122 }, { "epoch": 0.47670672959953286, "grad_norm": 1.0590847106243924, "learning_rate": 2.2129949854560072e-05, "loss": 0.1041, "step": 6123 }, { "epoch": 0.4767845846917425, "grad_norm": 1.193632077735357, "learning_rate": 2.2124986371529726e-05, "loss": 0.1456, "step": 6124 }, { "epoch": 0.4768624397839521, "grad_norm": 1.208770828572744, "learning_rate": 2.2120022756122716e-05, "loss": 0.1857, "step": 6125 }, { "epoch": 0.47694029487616174, "grad_norm": 1.1955319637085393, "learning_rate": 2.2115059008648246e-05, "loss": 0.1461, "step": 6126 }, { "epoch": 0.4770181499683714, "grad_norm": 1.019339560672434, "learning_rate": 2.211009512941555e-05, "loss": 0.1076, "step": 6127 }, { "epoch": 0.477096005060581, "grad_norm": 1.0381993811852126, "learning_rate": 2.2105131118733844e-05, "loss": 0.1236, "step": 6128 }, { "epoch": 0.4771738601527906, "grad_norm": 1.1234861353753702, "learning_rate": 2.210016697691236e-05, "loss": 0.1713, "step": 6129 }, { "epoch": 0.47725171524500026, "grad_norm": 1.1420026247153974, "learning_rate": 2.209520270426035e-05, "loss": 0.143, "step": 6130 }, { "epoch": 0.47732957033720985, "grad_norm": 1.126391840883815, "learning_rate": 2.2090238301087058e-05, "loss": 0.1245, "step": 6131 }, { "epoch": 0.4774074254294195, "grad_norm": 1.181629552419943, "learning_rate": 2.2085273767701745e-05, "loss": 0.1414, "step": 6132 }, { "epoch": 0.47748528052162914, "grad_norm": 1.0936091523313114, "learning_rate": 2.2080309104413677e-05, "loss": 0.1175, "step": 6133 }, { "epoch": 0.47756313561383873, "grad_norm": 1.1641114734958637, "learning_rate": 2.207534431153213e-05, "loss": 0.1595, "step": 6134 }, { "epoch": 0.4776409907060484, "grad_norm": 1.1129037223345581, "learning_rate": 2.207037938936639e-05, "loss": 0.1356, "step": 6135 }, { "epoch": 0.477718845798258, "grad_norm": 1.1531360759004068, "learning_rate": 2.206541433822574e-05, "loss": 0.1601, "step": 6136 }, { "epoch": 0.4777967008904676, "grad_norm": 1.199944568976019, "learning_rate": 2.2060449158419492e-05, "loss": 0.1451, "step": 6137 }, { "epoch": 0.47787455598267725, "grad_norm": 1.055693080057245, "learning_rate": 2.2055483850256944e-05, "loss": 0.1187, "step": 6138 }, { "epoch": 0.47795241107488684, "grad_norm": 1.1410234287565522, "learning_rate": 2.2050518414047412e-05, "loss": 0.1048, "step": 6139 }, { "epoch": 0.4780302661670965, "grad_norm": 1.1419565036329224, "learning_rate": 2.2045552850100226e-05, "loss": 0.1195, "step": 6140 }, { "epoch": 0.47810812125930613, "grad_norm": 1.0317144810568666, "learning_rate": 2.2040587158724713e-05, "loss": 0.1078, "step": 6141 }, { "epoch": 0.4781859763515157, "grad_norm": 1.1351681978225243, "learning_rate": 2.20356213402302e-05, "loss": 0.1635, "step": 6142 }, { "epoch": 0.47826383144372536, "grad_norm": 1.0163973030155924, "learning_rate": 2.2030655394926055e-05, "loss": 0.1145, "step": 6143 }, { "epoch": 0.478341686535935, "grad_norm": 1.1233578774896384, "learning_rate": 2.202568932312162e-05, "loss": 0.1593, "step": 6144 }, { "epoch": 0.4784195416281446, "grad_norm": 1.0626948884114606, "learning_rate": 2.2020723125126265e-05, "loss": 0.1239, "step": 6145 }, { "epoch": 0.47849739672035424, "grad_norm": 1.184909305623449, "learning_rate": 2.2015756801249356e-05, "loss": 0.1599, "step": 6146 }, { "epoch": 0.4785752518125639, "grad_norm": 1.1315123529360132, "learning_rate": 2.201079035180027e-05, "loss": 0.1271, "step": 6147 }, { "epoch": 0.4786531069047735, "grad_norm": 1.2128634573819848, "learning_rate": 2.2005823777088407e-05, "loss": 0.1226, "step": 6148 }, { "epoch": 0.4787309619969831, "grad_norm": 1.05201906873407, "learning_rate": 2.200085707742314e-05, "loss": 0.1131, "step": 6149 }, { "epoch": 0.47880881708919276, "grad_norm": 1.1213653444251568, "learning_rate": 2.199589025311389e-05, "loss": 0.1675, "step": 6150 }, { "epoch": 0.47880881708919276, "eval_loss": 0.017083166167140007, "eval_runtime": 167.2429, "eval_samples_per_second": 17.22, "eval_steps_per_second": 0.616, "step": 6150 }, { "epoch": 0.47888667218140235, "grad_norm": 1.0601401328583848, "learning_rate": 2.1990923304470055e-05, "loss": 0.1144, "step": 6151 }, { "epoch": 0.478964527273612, "grad_norm": 1.0477828055352059, "learning_rate": 2.198595623180105e-05, "loss": 0.1096, "step": 6152 }, { "epoch": 0.47904238236582164, "grad_norm": 1.1173072152359387, "learning_rate": 2.198098903541632e-05, "loss": 0.1394, "step": 6153 }, { "epoch": 0.47912023745803123, "grad_norm": 1.169971093846991, "learning_rate": 2.1976021715625274e-05, "loss": 0.1369, "step": 6154 }, { "epoch": 0.4791980925502409, "grad_norm": 1.075275385892262, "learning_rate": 2.1971054272737363e-05, "loss": 0.128, "step": 6155 }, { "epoch": 0.47927594764245046, "grad_norm": 1.113160301871187, "learning_rate": 2.1966086707062038e-05, "loss": 0.0955, "step": 6156 }, { "epoch": 0.4793538027346601, "grad_norm": 1.0958564769094123, "learning_rate": 2.1961119018908758e-05, "loss": 0.1545, "step": 6157 }, { "epoch": 0.47943165782686975, "grad_norm": 1.071276342070281, "learning_rate": 2.1956151208586977e-05, "loss": 0.1274, "step": 6158 }, { "epoch": 0.47950951291907934, "grad_norm": 1.1870386959656598, "learning_rate": 2.195118327640617e-05, "loss": 0.1655, "step": 6159 }, { "epoch": 0.479587368011289, "grad_norm": 1.113619616781938, "learning_rate": 2.1946215222675815e-05, "loss": 0.1503, "step": 6160 }, { "epoch": 0.47966522310349863, "grad_norm": 1.072744485684726, "learning_rate": 2.1941247047705398e-05, "loss": 0.1218, "step": 6161 }, { "epoch": 0.4797430781957082, "grad_norm": 1.0172815615906057, "learning_rate": 2.1936278751804423e-05, "loss": 0.1214, "step": 6162 }, { "epoch": 0.47982093328791786, "grad_norm": 1.0548532189632536, "learning_rate": 2.1931310335282375e-05, "loss": 0.1226, "step": 6163 }, { "epoch": 0.4798987883801275, "grad_norm": 1.1089237696354448, "learning_rate": 2.1926341798448777e-05, "loss": 0.1336, "step": 6164 }, { "epoch": 0.4799766434723371, "grad_norm": 1.0349320036265897, "learning_rate": 2.192137314161313e-05, "loss": 0.126, "step": 6165 }, { "epoch": 0.48005449856454674, "grad_norm": 1.1708485698578102, "learning_rate": 2.1916404365084976e-05, "loss": 0.1212, "step": 6166 }, { "epoch": 0.4801323536567564, "grad_norm": 1.0280856279915338, "learning_rate": 2.1911435469173837e-05, "loss": 0.111, "step": 6167 }, { "epoch": 0.48021020874896597, "grad_norm": 1.1704343483061233, "learning_rate": 2.1906466454189252e-05, "loss": 0.1415, "step": 6168 }, { "epoch": 0.4802880638411756, "grad_norm": 1.0335340570558869, "learning_rate": 2.190149732044077e-05, "loss": 0.1326, "step": 6169 }, { "epoch": 0.48036591893338526, "grad_norm": 1.1040035721870913, "learning_rate": 2.1896528068237936e-05, "loss": 0.1594, "step": 6170 }, { "epoch": 0.48044377402559485, "grad_norm": 1.1140889678867882, "learning_rate": 2.189155869789032e-05, "loss": 0.1532, "step": 6171 }, { "epoch": 0.4805216291178045, "grad_norm": 1.1284129196496688, "learning_rate": 2.1886589209707494e-05, "loss": 0.1346, "step": 6172 }, { "epoch": 0.48059948421001414, "grad_norm": 1.1456723159580358, "learning_rate": 2.1881619603999024e-05, "loss": 0.1576, "step": 6173 }, { "epoch": 0.4806773393022237, "grad_norm": 1.2086565451159106, "learning_rate": 2.18766498810745e-05, "loss": 0.1871, "step": 6174 }, { "epoch": 0.48075519439443337, "grad_norm": 1.0849140892936198, "learning_rate": 2.1871680041243503e-05, "loss": 0.1157, "step": 6175 }, { "epoch": 0.48083304948664296, "grad_norm": 1.1406763265672983, "learning_rate": 2.186671008481564e-05, "loss": 0.1394, "step": 6176 }, { "epoch": 0.4809109045788526, "grad_norm": 1.1083083785812031, "learning_rate": 2.186174001210052e-05, "loss": 0.1339, "step": 6177 }, { "epoch": 0.48098875967106225, "grad_norm": 1.056641656674347, "learning_rate": 2.185676982340774e-05, "loss": 0.1526, "step": 6178 }, { "epoch": 0.48106661476327184, "grad_norm": 1.0991799453695956, "learning_rate": 2.1851799519046933e-05, "loss": 0.1456, "step": 6179 }, { "epoch": 0.4811444698554815, "grad_norm": 1.11530932237928, "learning_rate": 2.184682909932771e-05, "loss": 0.1211, "step": 6180 }, { "epoch": 0.4812223249476911, "grad_norm": 1.1144701639170937, "learning_rate": 2.1841858564559725e-05, "loss": 0.1363, "step": 6181 }, { "epoch": 0.4813001800399007, "grad_norm": 1.0513211502410926, "learning_rate": 2.1836887915052612e-05, "loss": 0.126, "step": 6182 }, { "epoch": 0.48137803513211036, "grad_norm": 1.0955917150661314, "learning_rate": 2.1831917151116e-05, "loss": 0.1291, "step": 6183 }, { "epoch": 0.48145589022432, "grad_norm": 1.0247159600837759, "learning_rate": 2.182694627305958e-05, "loss": 0.1203, "step": 6184 }, { "epoch": 0.4815337453165296, "grad_norm": 1.0812355418535293, "learning_rate": 2.1821975281192977e-05, "loss": 0.1466, "step": 6185 }, { "epoch": 0.48161160040873924, "grad_norm": 1.1054308638866817, "learning_rate": 2.1817004175825892e-05, "loss": 0.1225, "step": 6186 }, { "epoch": 0.4816894555009489, "grad_norm": 1.1202023310494043, "learning_rate": 2.1812032957267978e-05, "loss": 0.1436, "step": 6187 }, { "epoch": 0.48176731059315847, "grad_norm": 1.0512951984992907, "learning_rate": 2.1807061625828928e-05, "loss": 0.1315, "step": 6188 }, { "epoch": 0.4818451656853681, "grad_norm": 1.0822189896439154, "learning_rate": 2.180209018181844e-05, "loss": 0.1074, "step": 6189 }, { "epoch": 0.48192302077757776, "grad_norm": 1.1544852915480264, "learning_rate": 2.1797118625546196e-05, "loss": 0.1331, "step": 6190 }, { "epoch": 0.48200087586978735, "grad_norm": 1.108134847888719, "learning_rate": 2.179214695732192e-05, "loss": 0.1376, "step": 6191 }, { "epoch": 0.482078730961997, "grad_norm": 1.0685226418269955, "learning_rate": 2.1787175177455305e-05, "loss": 0.1241, "step": 6192 }, { "epoch": 0.4821565860542066, "grad_norm": 1.0426366902975182, "learning_rate": 2.1782203286256073e-05, "loss": 0.1058, "step": 6193 }, { "epoch": 0.4822344411464162, "grad_norm": 1.1420443665435938, "learning_rate": 2.177723128403396e-05, "loss": 0.1349, "step": 6194 }, { "epoch": 0.48231229623862587, "grad_norm": 1.1656061921748415, "learning_rate": 2.177225917109869e-05, "loss": 0.1549, "step": 6195 }, { "epoch": 0.48239015133083546, "grad_norm": 1.1250232542160816, "learning_rate": 2.1767286947760007e-05, "loss": 0.1232, "step": 6196 }, { "epoch": 0.4824680064230451, "grad_norm": 1.1274734240607827, "learning_rate": 2.176231461432765e-05, "loss": 0.1541, "step": 6197 }, { "epoch": 0.48254586151525475, "grad_norm": 1.07026640269823, "learning_rate": 2.175734217111137e-05, "loss": 0.1168, "step": 6198 }, { "epoch": 0.48262371660746434, "grad_norm": 0.9962420673824597, "learning_rate": 2.1752369618420944e-05, "loss": 0.119, "step": 6199 }, { "epoch": 0.482701571699674, "grad_norm": 1.0223394696178312, "learning_rate": 2.1747396956566122e-05, "loss": 0.1084, "step": 6200 }, { "epoch": 0.482701571699674, "eval_loss": 0.016961487010121346, "eval_runtime": 167.661, "eval_samples_per_second": 17.178, "eval_steps_per_second": 0.614, "step": 6200 }, { "epoch": 0.4827794267918836, "grad_norm": 1.0885831693859844, "learning_rate": 2.1742424185856687e-05, "loss": 0.1341, "step": 6201 }, { "epoch": 0.4828572818840932, "grad_norm": 1.1386881642084437, "learning_rate": 2.1737451306602412e-05, "loss": 0.1567, "step": 6202 }, { "epoch": 0.48293513697630286, "grad_norm": 1.1150282832474467, "learning_rate": 2.173247831911309e-05, "loss": 0.1165, "step": 6203 }, { "epoch": 0.4830129920685125, "grad_norm": 1.1223638260151279, "learning_rate": 2.172750522369851e-05, "loss": 0.1372, "step": 6204 }, { "epoch": 0.4830908471607221, "grad_norm": 1.0960755511848863, "learning_rate": 2.1722532020668473e-05, "loss": 0.1363, "step": 6205 }, { "epoch": 0.48316870225293174, "grad_norm": 1.1167451605636116, "learning_rate": 2.1717558710332792e-05, "loss": 0.165, "step": 6206 }, { "epoch": 0.4832465573451414, "grad_norm": 0.9830917125558418, "learning_rate": 2.1712585293001275e-05, "loss": 0.0963, "step": 6207 }, { "epoch": 0.48332441243735097, "grad_norm": 1.2089919589462033, "learning_rate": 2.1707611768983742e-05, "loss": 0.1581, "step": 6208 }, { "epoch": 0.4834022675295606, "grad_norm": 1.0978489937336582, "learning_rate": 2.1702638138590018e-05, "loss": 0.138, "step": 6209 }, { "epoch": 0.4834801226217702, "grad_norm": 1.1609986875383083, "learning_rate": 2.1697664402129958e-05, "loss": 0.134, "step": 6210 }, { "epoch": 0.48355797771397985, "grad_norm": 1.2010316146293833, "learning_rate": 2.169269055991337e-05, "loss": 0.1278, "step": 6211 }, { "epoch": 0.4836358328061895, "grad_norm": 1.1032018893978321, "learning_rate": 2.1687716612250118e-05, "loss": 0.1232, "step": 6212 }, { "epoch": 0.4837136878983991, "grad_norm": 1.0936410903117892, "learning_rate": 2.1682742559450052e-05, "loss": 0.1358, "step": 6213 }, { "epoch": 0.4837915429906087, "grad_norm": 1.1044496380310929, "learning_rate": 2.1677768401823042e-05, "loss": 0.1153, "step": 6214 }, { "epoch": 0.48386939808281837, "grad_norm": 1.0649911378613528, "learning_rate": 2.167279413967895e-05, "loss": 0.128, "step": 6215 }, { "epoch": 0.48394725317502796, "grad_norm": 1.2064869888055962, "learning_rate": 2.1667819773327635e-05, "loss": 0.1439, "step": 6216 }, { "epoch": 0.4840251082672376, "grad_norm": 1.1466057448148474, "learning_rate": 2.1662845303078992e-05, "loss": 0.155, "step": 6217 }, { "epoch": 0.48410296335944725, "grad_norm": 1.0711498829417936, "learning_rate": 2.1657870729242902e-05, "loss": 0.1237, "step": 6218 }, { "epoch": 0.48418081845165684, "grad_norm": 1.1022366229190734, "learning_rate": 2.1652896052129263e-05, "loss": 0.1138, "step": 6219 }, { "epoch": 0.4842586735438665, "grad_norm": 1.0536067234920499, "learning_rate": 2.1647921272047966e-05, "loss": 0.1097, "step": 6220 }, { "epoch": 0.4843365286360761, "grad_norm": 1.0903692994549818, "learning_rate": 2.1642946389308914e-05, "loss": 0.1198, "step": 6221 }, { "epoch": 0.4844143837282857, "grad_norm": 1.0659743541746889, "learning_rate": 2.163797140422203e-05, "loss": 0.1437, "step": 6222 }, { "epoch": 0.48449223882049536, "grad_norm": 1.1886210851310022, "learning_rate": 2.163299631709723e-05, "loss": 0.1418, "step": 6223 }, { "epoch": 0.484570093912705, "grad_norm": 1.0027458656314836, "learning_rate": 2.1628021128244428e-05, "loss": 0.1155, "step": 6224 }, { "epoch": 0.4846479490049146, "grad_norm": 0.9949288594685753, "learning_rate": 2.1623045837973567e-05, "loss": 0.1159, "step": 6225 }, { "epoch": 0.48472580409712424, "grad_norm": 1.1203424852679291, "learning_rate": 2.161807044659457e-05, "loss": 0.1407, "step": 6226 }, { "epoch": 0.4848036591893339, "grad_norm": 1.1111107540835188, "learning_rate": 2.1613094954417404e-05, "loss": 0.113, "step": 6227 }, { "epoch": 0.48488151428154347, "grad_norm": 1.1076486149793412, "learning_rate": 2.1608119361751996e-05, "loss": 0.1163, "step": 6228 }, { "epoch": 0.4849593693737531, "grad_norm": 1.1037023895608664, "learning_rate": 2.1603143668908313e-05, "loss": 0.1604, "step": 6229 }, { "epoch": 0.4850372244659627, "grad_norm": 1.1306468348619865, "learning_rate": 2.1598167876196318e-05, "loss": 0.1284, "step": 6230 }, { "epoch": 0.48511507955817235, "grad_norm": 1.0948823873864686, "learning_rate": 2.1593191983925968e-05, "loss": 0.1357, "step": 6231 }, { "epoch": 0.485192934650382, "grad_norm": 1.0212938960414704, "learning_rate": 2.1588215992407252e-05, "loss": 0.1089, "step": 6232 }, { "epoch": 0.4852707897425916, "grad_norm": 1.0584207329209225, "learning_rate": 2.1583239901950142e-05, "loss": 0.1133, "step": 6233 }, { "epoch": 0.4853486448348012, "grad_norm": 1.0875230040723305, "learning_rate": 2.157826371286463e-05, "loss": 0.1555, "step": 6234 }, { "epoch": 0.48542649992701087, "grad_norm": 1.073482691183817, "learning_rate": 2.1573287425460712e-05, "loss": 0.1504, "step": 6235 }, { "epoch": 0.48550435501922046, "grad_norm": 1.1846261488348413, "learning_rate": 2.1568311040048377e-05, "loss": 0.1403, "step": 6236 }, { "epoch": 0.4855822101114301, "grad_norm": 1.0363728570850232, "learning_rate": 2.1563334556937634e-05, "loss": 0.1212, "step": 6237 }, { "epoch": 0.48566006520363975, "grad_norm": 1.1837525072157795, "learning_rate": 2.15583579764385e-05, "loss": 0.1193, "step": 6238 }, { "epoch": 0.48573792029584933, "grad_norm": 1.2319064764188958, "learning_rate": 2.1553381298860983e-05, "loss": 0.1596, "step": 6239 }, { "epoch": 0.485815775388059, "grad_norm": 1.1010474532694754, "learning_rate": 2.154840452451512e-05, "loss": 0.1672, "step": 6240 }, { "epoch": 0.4858936304802686, "grad_norm": 1.0060341574842449, "learning_rate": 2.1543427653710936e-05, "loss": 0.1394, "step": 6241 }, { "epoch": 0.4859714855724782, "grad_norm": 1.0258545304595006, "learning_rate": 2.153845068675845e-05, "loss": 0.1269, "step": 6242 }, { "epoch": 0.48604934066468786, "grad_norm": 1.0342933040103965, "learning_rate": 2.1533473623967728e-05, "loss": 0.1211, "step": 6243 }, { "epoch": 0.4861271957568975, "grad_norm": 1.1602974893713405, "learning_rate": 2.152849646564881e-05, "loss": 0.1337, "step": 6244 }, { "epoch": 0.4862050508491071, "grad_norm": 1.0507891159915566, "learning_rate": 2.152351921211174e-05, "loss": 0.1157, "step": 6245 }, { "epoch": 0.48628290594131673, "grad_norm": 1.1026503244324104, "learning_rate": 2.151854186366659e-05, "loss": 0.1393, "step": 6246 }, { "epoch": 0.4863607610335263, "grad_norm": 1.0105665398085055, "learning_rate": 2.151356442062341e-05, "loss": 0.1064, "step": 6247 }, { "epoch": 0.48643861612573597, "grad_norm": 1.091461457355377, "learning_rate": 2.1508586883292294e-05, "loss": 0.1536, "step": 6248 }, { "epoch": 0.4865164712179456, "grad_norm": 1.0882968825482258, "learning_rate": 2.1503609251983293e-05, "loss": 0.1353, "step": 6249 }, { "epoch": 0.4865943263101552, "grad_norm": 1.044754801702687, "learning_rate": 2.149863152700651e-05, "loss": 0.1396, "step": 6250 }, { "epoch": 0.4865943263101552, "eval_loss": 0.016627350822091103, "eval_runtime": 167.5769, "eval_samples_per_second": 17.186, "eval_steps_per_second": 0.615, "step": 6250 }, { "epoch": 0.48667218140236485, "grad_norm": 1.1740562358549833, "learning_rate": 2.1493653708672028e-05, "loss": 0.1271, "step": 6251 }, { "epoch": 0.4867500364945745, "grad_norm": 1.1428517968327552, "learning_rate": 2.1488675797289936e-05, "loss": 0.1754, "step": 6252 }, { "epoch": 0.4868278915867841, "grad_norm": 1.0477824920262044, "learning_rate": 2.148369779317035e-05, "loss": 0.1002, "step": 6253 }, { "epoch": 0.4869057466789937, "grad_norm": 1.1012674745020532, "learning_rate": 2.147871969662336e-05, "loss": 0.1085, "step": 6254 }, { "epoch": 0.48698360177120337, "grad_norm": 1.102580462117319, "learning_rate": 2.1473741507959075e-05, "loss": 0.1682, "step": 6255 }, { "epoch": 0.48706145686341296, "grad_norm": 1.1400010246923717, "learning_rate": 2.1468763227487627e-05, "loss": 0.1275, "step": 6256 }, { "epoch": 0.4871393119556226, "grad_norm": 1.0552198233254262, "learning_rate": 2.1463784855519136e-05, "loss": 0.1328, "step": 6257 }, { "epoch": 0.48721716704783224, "grad_norm": 1.2505280000503354, "learning_rate": 2.145880639236373e-05, "loss": 0.1809, "step": 6258 }, { "epoch": 0.48729502214004183, "grad_norm": 1.06234176572644, "learning_rate": 2.145382783833154e-05, "loss": 0.0936, "step": 6259 }, { "epoch": 0.4873728772322515, "grad_norm": 1.0682239852770536, "learning_rate": 2.1448849193732703e-05, "loss": 0.1226, "step": 6260 }, { "epoch": 0.4874507323244611, "grad_norm": 1.157246165614174, "learning_rate": 2.144387045887738e-05, "loss": 0.1883, "step": 6261 }, { "epoch": 0.4875285874166707, "grad_norm": 1.0944206609522515, "learning_rate": 2.1438891634075713e-05, "loss": 0.1184, "step": 6262 }, { "epoch": 0.48760644250888036, "grad_norm": 1.0805727899747233, "learning_rate": 2.143391271963786e-05, "loss": 0.117, "step": 6263 }, { "epoch": 0.48768429760108994, "grad_norm": 1.0954167766770293, "learning_rate": 2.1428933715873983e-05, "loss": 0.1293, "step": 6264 }, { "epoch": 0.4877621526932996, "grad_norm": 1.1571092324052694, "learning_rate": 2.1423954623094252e-05, "loss": 0.1516, "step": 6265 }, { "epoch": 0.48784000778550923, "grad_norm": 1.164834895488685, "learning_rate": 2.141897544160884e-05, "loss": 0.1375, "step": 6266 }, { "epoch": 0.4879178628777188, "grad_norm": 1.0577872750523225, "learning_rate": 2.1413996171727932e-05, "loss": 0.1233, "step": 6267 }, { "epoch": 0.48799571796992847, "grad_norm": 1.168885550508924, "learning_rate": 2.1409016813761708e-05, "loss": 0.1488, "step": 6268 }, { "epoch": 0.4880735730621381, "grad_norm": 1.2342316382353666, "learning_rate": 2.140403736802036e-05, "loss": 0.1245, "step": 6269 }, { "epoch": 0.4881514281543477, "grad_norm": 1.1537444208141645, "learning_rate": 2.1399057834814076e-05, "loss": 0.1443, "step": 6270 }, { "epoch": 0.48822928324655734, "grad_norm": 1.0879962788475985, "learning_rate": 2.1394078214453074e-05, "loss": 0.1152, "step": 6271 }, { "epoch": 0.488307138338767, "grad_norm": 1.1023940151121319, "learning_rate": 2.1389098507247547e-05, "loss": 0.1265, "step": 6272 }, { "epoch": 0.4883849934309766, "grad_norm": 1.098822685447828, "learning_rate": 2.1384118713507717e-05, "loss": 0.1207, "step": 6273 }, { "epoch": 0.4884628485231862, "grad_norm": 1.0900560834007034, "learning_rate": 2.1379138833543792e-05, "loss": 0.1229, "step": 6274 }, { "epoch": 0.48854070361539587, "grad_norm": 0.9915661820985935, "learning_rate": 2.1374158867666e-05, "loss": 0.1292, "step": 6275 }, { "epoch": 0.48861855870760545, "grad_norm": 1.136605531734703, "learning_rate": 2.1369178816184575e-05, "loss": 0.1412, "step": 6276 }, { "epoch": 0.4886964137998151, "grad_norm": 1.1239366429430986, "learning_rate": 2.1364198679409747e-05, "loss": 0.1315, "step": 6277 }, { "epoch": 0.48877426889202474, "grad_norm": 1.054471531571773, "learning_rate": 2.135921845765175e-05, "loss": 0.1121, "step": 6278 }, { "epoch": 0.48885212398423433, "grad_norm": 1.048226813640681, "learning_rate": 2.1354238151220826e-05, "loss": 0.1194, "step": 6279 }, { "epoch": 0.488929979076444, "grad_norm": 1.1506022524404418, "learning_rate": 2.1349257760427235e-05, "loss": 0.1261, "step": 6280 }, { "epoch": 0.4890078341686536, "grad_norm": 1.1018258077337426, "learning_rate": 2.134427728558123e-05, "loss": 0.1088, "step": 6281 }, { "epoch": 0.4890856892608632, "grad_norm": 1.0580043964070818, "learning_rate": 2.1339296726993074e-05, "loss": 0.1221, "step": 6282 }, { "epoch": 0.48916354435307285, "grad_norm": 1.1218799829998607, "learning_rate": 2.1334316084973014e-05, "loss": 0.1297, "step": 6283 }, { "epoch": 0.48924139944528244, "grad_norm": 0.9652884737522478, "learning_rate": 2.132933535983134e-05, "loss": 0.1385, "step": 6284 }, { "epoch": 0.4893192545374921, "grad_norm": 1.007534513860518, "learning_rate": 2.1324354551878323e-05, "loss": 0.1137, "step": 6285 }, { "epoch": 0.48939710962970173, "grad_norm": 1.1374864338001427, "learning_rate": 2.1319373661424245e-05, "loss": 0.1601, "step": 6286 }, { "epoch": 0.4894749647219113, "grad_norm": 1.1104316362785434, "learning_rate": 2.1314392688779385e-05, "loss": 0.1268, "step": 6287 }, { "epoch": 0.48955281981412097, "grad_norm": 0.9943524850667785, "learning_rate": 2.1309411634254035e-05, "loss": 0.0909, "step": 6288 }, { "epoch": 0.4896306749063306, "grad_norm": 1.0502201707512904, "learning_rate": 2.13044304981585e-05, "loss": 0.1336, "step": 6289 }, { "epoch": 0.4897085299985402, "grad_norm": 1.0620666490486033, "learning_rate": 2.129944928080307e-05, "loss": 0.1023, "step": 6290 }, { "epoch": 0.48978638509074984, "grad_norm": 0.9925768215174284, "learning_rate": 2.129446798249807e-05, "loss": 0.1084, "step": 6291 }, { "epoch": 0.4898642401829595, "grad_norm": 1.0489865898159805, "learning_rate": 2.1289486603553793e-05, "loss": 0.1091, "step": 6292 }, { "epoch": 0.4899420952751691, "grad_norm": 0.9849040801279602, "learning_rate": 2.128450514428056e-05, "loss": 0.0965, "step": 6293 }, { "epoch": 0.4900199503673787, "grad_norm": 1.074754928533498, "learning_rate": 2.1279523604988696e-05, "loss": 0.1357, "step": 6294 }, { "epoch": 0.49009780545958836, "grad_norm": 1.0374095362073152, "learning_rate": 2.1274541985988523e-05, "loss": 0.1104, "step": 6295 }, { "epoch": 0.49017566055179795, "grad_norm": 1.0443142458060262, "learning_rate": 2.1269560287590377e-05, "loss": 0.1305, "step": 6296 }, { "epoch": 0.4902535156440076, "grad_norm": 1.0559459358778025, "learning_rate": 2.12645785101046e-05, "loss": 0.1158, "step": 6297 }, { "epoch": 0.49033137073621724, "grad_norm": 1.129485447069154, "learning_rate": 2.1259596653841515e-05, "loss": 0.1163, "step": 6298 }, { "epoch": 0.49040922582842683, "grad_norm": 1.2122592560442385, "learning_rate": 2.125461471911148e-05, "loss": 0.1655, "step": 6299 }, { "epoch": 0.4904870809206365, "grad_norm": 1.1627360456660316, "learning_rate": 2.1249632706224848e-05, "loss": 0.1362, "step": 6300 }, { "epoch": 0.4904870809206365, "eval_loss": 0.016566293314099312, "eval_runtime": 166.9128, "eval_samples_per_second": 17.255, "eval_steps_per_second": 0.617, "step": 6300 }, { "epoch": 0.49056493601284606, "grad_norm": 1.0853803499503831, "learning_rate": 2.124465061549197e-05, "loss": 0.129, "step": 6301 }, { "epoch": 0.4906427911050557, "grad_norm": 0.9555915718257532, "learning_rate": 2.123966844722322e-05, "loss": 0.1125, "step": 6302 }, { "epoch": 0.49072064619726535, "grad_norm": 1.0148909902833951, "learning_rate": 2.1234686201728937e-05, "loss": 0.1404, "step": 6303 }, { "epoch": 0.49079850128947494, "grad_norm": 0.9859365092517443, "learning_rate": 2.1229703879319515e-05, "loss": 0.1192, "step": 6304 }, { "epoch": 0.4908763563816846, "grad_norm": 0.998635061734174, "learning_rate": 2.122472148030532e-05, "loss": 0.1054, "step": 6305 }, { "epoch": 0.49095421147389423, "grad_norm": 1.0012091211783798, "learning_rate": 2.121973900499674e-05, "loss": 0.1405, "step": 6306 }, { "epoch": 0.4910320665661038, "grad_norm": 1.1314999641403913, "learning_rate": 2.1214756453704144e-05, "loss": 0.1326, "step": 6307 }, { "epoch": 0.49110992165831346, "grad_norm": 1.0434553092417853, "learning_rate": 2.120977382673793e-05, "loss": 0.124, "step": 6308 }, { "epoch": 0.4911877767505231, "grad_norm": 1.014643496678892, "learning_rate": 2.1204791124408495e-05, "loss": 0.0923, "step": 6309 }, { "epoch": 0.4912656318427327, "grad_norm": 1.10495884023826, "learning_rate": 2.1199808347026246e-05, "loss": 0.1231, "step": 6310 }, { "epoch": 0.49134348693494234, "grad_norm": 1.171891026147429, "learning_rate": 2.119482549490157e-05, "loss": 0.1584, "step": 6311 }, { "epoch": 0.491421342027152, "grad_norm": 1.09684323821428, "learning_rate": 2.1189842568344877e-05, "loss": 0.1138, "step": 6312 }, { "epoch": 0.4914991971193616, "grad_norm": 0.9749589985309556, "learning_rate": 2.1184859567666593e-05, "loss": 0.0973, "step": 6313 }, { "epoch": 0.4915770522115712, "grad_norm": 1.189033193122961, "learning_rate": 2.1179876493177115e-05, "loss": 0.1629, "step": 6314 }, { "epoch": 0.49165490730378086, "grad_norm": 0.9914090766855354, "learning_rate": 2.117489334518689e-05, "loss": 0.1157, "step": 6315 }, { "epoch": 0.49173276239599045, "grad_norm": 0.969416642283542, "learning_rate": 2.1169910124006324e-05, "loss": 0.1141, "step": 6316 }, { "epoch": 0.4918106174882001, "grad_norm": 1.0746258707775211, "learning_rate": 2.116492682994586e-05, "loss": 0.1301, "step": 6317 }, { "epoch": 0.4918884725804097, "grad_norm": 1.0837647552271041, "learning_rate": 2.115994346331593e-05, "loss": 0.1094, "step": 6318 }, { "epoch": 0.49196632767261933, "grad_norm": 1.0831107169956276, "learning_rate": 2.115496002442697e-05, "loss": 0.1337, "step": 6319 }, { "epoch": 0.492044182764829, "grad_norm": 1.120064081239538, "learning_rate": 2.1149976513589442e-05, "loss": 0.1675, "step": 6320 }, { "epoch": 0.49212203785703856, "grad_norm": 1.1326200849078711, "learning_rate": 2.1144992931113767e-05, "loss": 0.1244, "step": 6321 }, { "epoch": 0.4921998929492482, "grad_norm": 1.0397996594094234, "learning_rate": 2.1140009277310424e-05, "loss": 0.1113, "step": 6322 }, { "epoch": 0.49227774804145785, "grad_norm": 1.073342623773051, "learning_rate": 2.1135025552489856e-05, "loss": 0.1216, "step": 6323 }, { "epoch": 0.49235560313366744, "grad_norm": 1.0060908127152612, "learning_rate": 2.1130041756962538e-05, "loss": 0.1047, "step": 6324 }, { "epoch": 0.4924334582258771, "grad_norm": 1.0336248665608458, "learning_rate": 2.1125057891038926e-05, "loss": 0.1148, "step": 6325 }, { "epoch": 0.49251131331808673, "grad_norm": 0.9824952462763548, "learning_rate": 2.112007395502949e-05, "loss": 0.1163, "step": 6326 }, { "epoch": 0.4925891684102963, "grad_norm": 1.07990605070506, "learning_rate": 2.1115089949244722e-05, "loss": 0.1339, "step": 6327 }, { "epoch": 0.49266702350250596, "grad_norm": 1.0585182886233804, "learning_rate": 2.1110105873995087e-05, "loss": 0.1248, "step": 6328 }, { "epoch": 0.4927448785947156, "grad_norm": 1.0524214190294283, "learning_rate": 2.110512172959107e-05, "loss": 0.1494, "step": 6329 }, { "epoch": 0.4928227336869252, "grad_norm": 1.1223462029485813, "learning_rate": 2.110013751634317e-05, "loss": 0.1199, "step": 6330 }, { "epoch": 0.49290058877913484, "grad_norm": 1.1796716933607418, "learning_rate": 2.1095153234561874e-05, "loss": 0.1501, "step": 6331 }, { "epoch": 0.4929784438713445, "grad_norm": 1.0585610144013033, "learning_rate": 2.1090168884557675e-05, "loss": 0.1135, "step": 6332 }, { "epoch": 0.4930562989635541, "grad_norm": 1.1875108225223026, "learning_rate": 2.1085184466641078e-05, "loss": 0.1637, "step": 6333 }, { "epoch": 0.4931341540557637, "grad_norm": 0.9929021643235313, "learning_rate": 2.1080199981122592e-05, "loss": 0.1037, "step": 6334 }, { "epoch": 0.4932120091479733, "grad_norm": 1.0952928536640003, "learning_rate": 2.1075215428312722e-05, "loss": 0.1308, "step": 6335 }, { "epoch": 0.49328986424018295, "grad_norm": 1.0576823716230932, "learning_rate": 2.1070230808521994e-05, "loss": 0.1162, "step": 6336 }, { "epoch": 0.4933677193323926, "grad_norm": 1.0187767111351214, "learning_rate": 2.1065246122060902e-05, "loss": 0.0977, "step": 6337 }, { "epoch": 0.4934455744246022, "grad_norm": 1.0223502633894666, "learning_rate": 2.1060261369239994e-05, "loss": 0.114, "step": 6338 }, { "epoch": 0.49352342951681183, "grad_norm": 1.1522611024286271, "learning_rate": 2.1055276550369784e-05, "loss": 0.1464, "step": 6339 }, { "epoch": 0.4936012846090215, "grad_norm": 1.1301112591767863, "learning_rate": 2.1050291665760808e-05, "loss": 0.1286, "step": 6340 }, { "epoch": 0.49367913970123106, "grad_norm": 1.0855492686054307, "learning_rate": 2.1045306715723595e-05, "loss": 0.1351, "step": 6341 }, { "epoch": 0.4937569947934407, "grad_norm": 1.0502989283858544, "learning_rate": 2.1040321700568686e-05, "loss": 0.101, "step": 6342 }, { "epoch": 0.49383484988565035, "grad_norm": 1.0809743735962585, "learning_rate": 2.1035336620606624e-05, "loss": 0.1438, "step": 6343 }, { "epoch": 0.49391270497785994, "grad_norm": 1.0469415670916713, "learning_rate": 2.1030351476147964e-05, "loss": 0.1241, "step": 6344 }, { "epoch": 0.4939905600700696, "grad_norm": 1.059837245154368, "learning_rate": 2.1025366267503244e-05, "loss": 0.1495, "step": 6345 }, { "epoch": 0.49406841516227923, "grad_norm": 1.1510219518011917, "learning_rate": 2.1020380994983028e-05, "loss": 0.177, "step": 6346 }, { "epoch": 0.4941462702544888, "grad_norm": 0.9531302494423906, "learning_rate": 2.1015395658897865e-05, "loss": 0.1175, "step": 6347 }, { "epoch": 0.49422412534669846, "grad_norm": 1.116291328383493, "learning_rate": 2.101041025955834e-05, "loss": 0.1184, "step": 6348 }, { "epoch": 0.4943019804389081, "grad_norm": 1.0874929958650843, "learning_rate": 2.1005424797274996e-05, "loss": 0.1057, "step": 6349 }, { "epoch": 0.4943798355311177, "grad_norm": 1.0519451314678565, "learning_rate": 2.100043927235841e-05, "loss": 0.1217, "step": 6350 }, { "epoch": 0.4943798355311177, "eval_loss": 0.016125090420246124, "eval_runtime": 167.0721, "eval_samples_per_second": 17.238, "eval_steps_per_second": 0.617, "step": 6350 }, { "epoch": 0.49445769062332734, "grad_norm": 1.0395248783108961, "learning_rate": 2.099545368511916e-05, "loss": 0.124, "step": 6351 }, { "epoch": 0.494535545715537, "grad_norm": 0.9706895569494369, "learning_rate": 2.0990468035867824e-05, "loss": 0.1101, "step": 6352 }, { "epoch": 0.4946134008077466, "grad_norm": 1.1321242353889782, "learning_rate": 2.0985482324914996e-05, "loss": 0.1408, "step": 6353 }, { "epoch": 0.4946912558999562, "grad_norm": 1.258609632554781, "learning_rate": 2.098049655257125e-05, "loss": 0.1416, "step": 6354 }, { "epoch": 0.4947691109921658, "grad_norm": 1.0286537971856822, "learning_rate": 2.0975510719147164e-05, "loss": 0.1138, "step": 6355 }, { "epoch": 0.49484696608437545, "grad_norm": 1.116707112395698, "learning_rate": 2.0970524824953354e-05, "loss": 0.1505, "step": 6356 }, { "epoch": 0.4949248211765851, "grad_norm": 1.091438310875366, "learning_rate": 2.0965538870300405e-05, "loss": 0.1081, "step": 6357 }, { "epoch": 0.4950026762687947, "grad_norm": 0.955786169591438, "learning_rate": 2.096055285549893e-05, "loss": 0.1013, "step": 6358 }, { "epoch": 0.49508053136100433, "grad_norm": 1.0627211500105336, "learning_rate": 2.0955566780859525e-05, "loss": 0.1609, "step": 6359 }, { "epoch": 0.495158386453214, "grad_norm": 1.144055149778813, "learning_rate": 2.0950580646692797e-05, "loss": 0.1221, "step": 6360 }, { "epoch": 0.49523624154542356, "grad_norm": 1.1667105158497604, "learning_rate": 2.0945594453309365e-05, "loss": 0.136, "step": 6361 }, { "epoch": 0.4953140966376332, "grad_norm": 0.9832423304232852, "learning_rate": 2.0940608201019844e-05, "loss": 0.1138, "step": 6362 }, { "epoch": 0.49539195172984285, "grad_norm": 1.1982459589460837, "learning_rate": 2.0935621890134853e-05, "loss": 0.1329, "step": 6363 }, { "epoch": 0.49546980682205244, "grad_norm": 1.0708065322333795, "learning_rate": 2.093063552096502e-05, "loss": 0.1191, "step": 6364 }, { "epoch": 0.4955476619142621, "grad_norm": 1.1329231595101477, "learning_rate": 2.092564909382096e-05, "loss": 0.1483, "step": 6365 }, { "epoch": 0.4956255170064717, "grad_norm": 1.0680563461398187, "learning_rate": 2.0920662609013315e-05, "loss": 0.1195, "step": 6366 }, { "epoch": 0.4957033720986813, "grad_norm": 1.1187463895362912, "learning_rate": 2.091567606685272e-05, "loss": 0.1442, "step": 6367 }, { "epoch": 0.49578122719089096, "grad_norm": 1.0840427534416712, "learning_rate": 2.0910689467649807e-05, "loss": 0.1326, "step": 6368 }, { "epoch": 0.4958590822831006, "grad_norm": 0.9938374642836579, "learning_rate": 2.090570281171523e-05, "loss": 0.116, "step": 6369 }, { "epoch": 0.4959369373753102, "grad_norm": 1.0541039195708297, "learning_rate": 2.090071609935961e-05, "loss": 0.137, "step": 6370 }, { "epoch": 0.49601479246751984, "grad_norm": 1.0792053055610091, "learning_rate": 2.089572933089362e-05, "loss": 0.1325, "step": 6371 }, { "epoch": 0.4960926475597294, "grad_norm": 1.265513794872406, "learning_rate": 2.08907425066279e-05, "loss": 0.1407, "step": 6372 }, { "epoch": 0.49617050265193907, "grad_norm": 1.0160119706076935, "learning_rate": 2.0885755626873112e-05, "loss": 0.1037, "step": 6373 }, { "epoch": 0.4962483577441487, "grad_norm": 1.0995462363032957, "learning_rate": 2.0880768691939908e-05, "loss": 0.1527, "step": 6374 }, { "epoch": 0.4963262128363583, "grad_norm": 1.1001346923819566, "learning_rate": 2.087578170213895e-05, "loss": 0.149, "step": 6375 }, { "epoch": 0.49640406792856795, "grad_norm": 1.023859162992651, "learning_rate": 2.087079465778092e-05, "loss": 0.1209, "step": 6376 }, { "epoch": 0.4964819230207776, "grad_norm": 1.0226531329529382, "learning_rate": 2.0865807559176473e-05, "loss": 0.1103, "step": 6377 }, { "epoch": 0.4965597781129872, "grad_norm": 1.061078079660978, "learning_rate": 2.086082040663628e-05, "loss": 0.1419, "step": 6378 }, { "epoch": 0.4966376332051968, "grad_norm": 0.9969870201890715, "learning_rate": 2.0855833200471026e-05, "loss": 0.1068, "step": 6379 }, { "epoch": 0.49671548829740647, "grad_norm": 0.9861441285667342, "learning_rate": 2.085084594099138e-05, "loss": 0.1031, "step": 6380 }, { "epoch": 0.49679334338961606, "grad_norm": 1.060400128481258, "learning_rate": 2.0845858628508037e-05, "loss": 0.144, "step": 6381 }, { "epoch": 0.4968711984818257, "grad_norm": 1.0369581818617837, "learning_rate": 2.0840871263331684e-05, "loss": 0.1075, "step": 6382 }, { "epoch": 0.49694905357403535, "grad_norm": 1.0835874706899569, "learning_rate": 2.0835883845772987e-05, "loss": 0.136, "step": 6383 }, { "epoch": 0.49702690866624494, "grad_norm": 1.145153084557178, "learning_rate": 2.0830896376142666e-05, "loss": 0.1468, "step": 6384 }, { "epoch": 0.4971047637584546, "grad_norm": 1.0170129580311482, "learning_rate": 2.08259088547514e-05, "loss": 0.1448, "step": 6385 }, { "epoch": 0.4971826188506642, "grad_norm": 1.1560727746607093, "learning_rate": 2.082092128190991e-05, "loss": 0.176, "step": 6386 }, { "epoch": 0.4972604739428738, "grad_norm": 1.0515792123414172, "learning_rate": 2.081593365792887e-05, "loss": 0.1585, "step": 6387 }, { "epoch": 0.49733832903508346, "grad_norm": 1.0629449701442064, "learning_rate": 2.0810945983118996e-05, "loss": 0.1423, "step": 6388 }, { "epoch": 0.49741618412729305, "grad_norm": 1.0350806999248106, "learning_rate": 2.0805958257791007e-05, "loss": 0.1234, "step": 6389 }, { "epoch": 0.4974940392195027, "grad_norm": 1.0225691283933493, "learning_rate": 2.08009704822556e-05, "loss": 0.1265, "step": 6390 }, { "epoch": 0.49757189431171234, "grad_norm": 1.030425606418306, "learning_rate": 2.0795982656823503e-05, "loss": 0.1341, "step": 6391 }, { "epoch": 0.4976497494039219, "grad_norm": 1.0309171474613237, "learning_rate": 2.079099478180543e-05, "loss": 0.1043, "step": 6392 }, { "epoch": 0.49772760449613157, "grad_norm": 1.0540761719696619, "learning_rate": 2.0786006857512087e-05, "loss": 0.1278, "step": 6393 }, { "epoch": 0.4978054595883412, "grad_norm": 0.9485218027364916, "learning_rate": 2.078101888425422e-05, "loss": 0.0937, "step": 6394 }, { "epoch": 0.4978833146805508, "grad_norm": 1.073638231336169, "learning_rate": 2.077603086234255e-05, "loss": 0.1292, "step": 6395 }, { "epoch": 0.49796116977276045, "grad_norm": 1.0693723006420008, "learning_rate": 2.07710427920878e-05, "loss": 0.1338, "step": 6396 }, { "epoch": 0.4980390248649701, "grad_norm": 1.1818620797642294, "learning_rate": 2.076605467380071e-05, "loss": 0.1299, "step": 6397 }, { "epoch": 0.4981168799571797, "grad_norm": 1.0154640055804343, "learning_rate": 2.076106650779201e-05, "loss": 0.1187, "step": 6398 }, { "epoch": 0.4981947350493893, "grad_norm": 1.1118871635346554, "learning_rate": 2.075607829437245e-05, "loss": 0.1043, "step": 6399 }, { "epoch": 0.49827259014159897, "grad_norm": 1.0405911351486063, "learning_rate": 2.075109003385277e-05, "loss": 0.1019, "step": 6400 }, { "epoch": 0.49827259014159897, "eval_loss": 0.01586207188665867, "eval_runtime": 162.3879, "eval_samples_per_second": 17.735, "eval_steps_per_second": 0.634, "step": 6400 }, { "epoch": 0.49835044523380856, "grad_norm": 1.0624206148761606, "learning_rate": 2.0746101726543704e-05, "loss": 0.113, "step": 6401 }, { "epoch": 0.4984283003260182, "grad_norm": 1.0914404250370866, "learning_rate": 2.074111337275601e-05, "loss": 0.106, "step": 6402 }, { "epoch": 0.49850615541822785, "grad_norm": 1.0111794719935292, "learning_rate": 2.073612497280043e-05, "loss": 0.1112, "step": 6403 }, { "epoch": 0.49858401051043744, "grad_norm": 1.0896685247166653, "learning_rate": 2.0731136526987735e-05, "loss": 0.1288, "step": 6404 }, { "epoch": 0.4986618656026471, "grad_norm": 0.9923626264270705, "learning_rate": 2.0726148035628668e-05, "loss": 0.1114, "step": 6405 }, { "epoch": 0.4987397206948567, "grad_norm": 1.1642023564564212, "learning_rate": 2.072115949903399e-05, "loss": 0.1515, "step": 6406 }, { "epoch": 0.4988175757870663, "grad_norm": 1.0763982402132608, "learning_rate": 2.0716170917514465e-05, "loss": 0.1372, "step": 6407 }, { "epoch": 0.49889543087927596, "grad_norm": 1.041817669543859, "learning_rate": 2.071118229138086e-05, "loss": 0.1253, "step": 6408 }, { "epoch": 0.49897328597148555, "grad_norm": 1.0767471614822344, "learning_rate": 2.0706193620943935e-05, "loss": 0.1227, "step": 6409 }, { "epoch": 0.4990511410636952, "grad_norm": 1.0618230667049984, "learning_rate": 2.070120490651447e-05, "loss": 0.1405, "step": 6410 }, { "epoch": 0.49912899615590484, "grad_norm": 1.0478551703192194, "learning_rate": 2.069621614840324e-05, "loss": 0.1177, "step": 6411 }, { "epoch": 0.4992068512481144, "grad_norm": 1.1241887208884818, "learning_rate": 2.0691227346921012e-05, "loss": 0.1698, "step": 6412 }, { "epoch": 0.49928470634032407, "grad_norm": 1.0404028888273105, "learning_rate": 2.0686238502378567e-05, "loss": 0.1265, "step": 6413 }, { "epoch": 0.4993625614325337, "grad_norm": 1.0593894339716725, "learning_rate": 2.0681249615086688e-05, "loss": 0.103, "step": 6414 }, { "epoch": 0.4994404165247433, "grad_norm": 0.9836956108626367, "learning_rate": 2.067626068535617e-05, "loss": 0.124, "step": 6415 }, { "epoch": 0.49951827161695295, "grad_norm": 1.071452690468535, "learning_rate": 2.0671271713497783e-05, "loss": 0.1277, "step": 6416 }, { "epoch": 0.4995961267091626, "grad_norm": 1.0562149213677772, "learning_rate": 2.0666282699822324e-05, "loss": 0.1562, "step": 6417 }, { "epoch": 0.4996739818013722, "grad_norm": 1.1442119150293524, "learning_rate": 2.0661293644640587e-05, "loss": 0.0994, "step": 6418 }, { "epoch": 0.4997518368935818, "grad_norm": 1.1349260908924432, "learning_rate": 2.065630454826336e-05, "loss": 0.1427, "step": 6419 }, { "epoch": 0.49982969198579147, "grad_norm": 1.199042826429388, "learning_rate": 2.0651315411001454e-05, "loss": 0.186, "step": 6420 }, { "epoch": 0.49990754707800106, "grad_norm": 1.0377841357495845, "learning_rate": 2.0646326233165646e-05, "loss": 0.1406, "step": 6421 }, { "epoch": 0.4999854021702107, "grad_norm": 0.9788856726691794, "learning_rate": 2.064133701506676e-05, "loss": 0.1189, "step": 6422 }, { "epoch": 0.5000632572624203, "grad_norm": 0.9646299855184883, "learning_rate": 2.063634775701559e-05, "loss": 0.1064, "step": 6423 }, { "epoch": 0.50014111235463, "grad_norm": 1.0136688562773437, "learning_rate": 2.0631358459322947e-05, "loss": 0.116, "step": 6424 }, { "epoch": 0.5002189674468396, "grad_norm": 1.0624174720404795, "learning_rate": 2.062636912229964e-05, "loss": 0.1186, "step": 6425 }, { "epoch": 0.5002968225390492, "grad_norm": 1.0294234380849325, "learning_rate": 2.0621379746256482e-05, "loss": 0.1243, "step": 6426 }, { "epoch": 0.5003746776312589, "grad_norm": 1.0115748521634225, "learning_rate": 2.0616390331504278e-05, "loss": 0.1323, "step": 6427 }, { "epoch": 0.5004525327234685, "grad_norm": 1.0251034207501317, "learning_rate": 2.061140087835386e-05, "loss": 0.1087, "step": 6428 }, { "epoch": 0.500530387815678, "grad_norm": 1.0690939113334685, "learning_rate": 2.0606411387116046e-05, "loss": 0.1599, "step": 6429 }, { "epoch": 0.5006082429078877, "grad_norm": 1.0076621702084163, "learning_rate": 2.0601421858101648e-05, "loss": 0.1256, "step": 6430 }, { "epoch": 0.5006860980000973, "grad_norm": 0.9673787679207325, "learning_rate": 2.0596432291621493e-05, "loss": 0.1221, "step": 6431 }, { "epoch": 0.5007639530923069, "grad_norm": 1.1355251359579894, "learning_rate": 2.059144268798641e-05, "loss": 0.1264, "step": 6432 }, { "epoch": 0.5008418081845166, "grad_norm": 0.9879277880339276, "learning_rate": 2.058645304750723e-05, "loss": 0.1043, "step": 6433 }, { "epoch": 0.5009196632767262, "grad_norm": 1.036766158542567, "learning_rate": 2.0581463370494784e-05, "loss": 0.126, "step": 6434 }, { "epoch": 0.5009975183689358, "grad_norm": 0.9573876077115415, "learning_rate": 2.05764736572599e-05, "loss": 0.1188, "step": 6435 }, { "epoch": 0.5010753734611454, "grad_norm": 1.0501100635602683, "learning_rate": 2.057148390811342e-05, "loss": 0.1193, "step": 6436 }, { "epoch": 0.5011532285533551, "grad_norm": 1.1162894404438448, "learning_rate": 2.0566494123366172e-05, "loss": 0.1348, "step": 6437 }, { "epoch": 0.5012310836455647, "grad_norm": 1.0247350157477075, "learning_rate": 2.0561504303329008e-05, "loss": 0.115, "step": 6438 }, { "epoch": 0.5013089387377743, "grad_norm": 1.0930740322964871, "learning_rate": 2.0556514448312764e-05, "loss": 0.1544, "step": 6439 }, { "epoch": 0.501386793829984, "grad_norm": 1.059649463481374, "learning_rate": 2.0551524558628285e-05, "loss": 0.1329, "step": 6440 }, { "epoch": 0.5014646489221936, "grad_norm": 0.989013531068548, "learning_rate": 2.054653463458642e-05, "loss": 0.1015, "step": 6441 }, { "epoch": 0.5015425040144031, "grad_norm": 1.0128933543589298, "learning_rate": 2.0541544676498012e-05, "loss": 0.105, "step": 6442 }, { "epoch": 0.5016203591066128, "grad_norm": 1.1704571863197273, "learning_rate": 2.0536554684673923e-05, "loss": 0.121, "step": 6443 }, { "epoch": 0.5016982141988224, "grad_norm": 1.0103406095416019, "learning_rate": 2.0531564659424995e-05, "loss": 0.1229, "step": 6444 }, { "epoch": 0.501776069291032, "grad_norm": 1.0718275094857992, "learning_rate": 2.0526574601062087e-05, "loss": 0.1284, "step": 6445 }, { "epoch": 0.5018539243832417, "grad_norm": 1.0957105362509088, "learning_rate": 2.0521584509896056e-05, "loss": 0.1334, "step": 6446 }, { "epoch": 0.5019317794754513, "grad_norm": 1.0728006636802605, "learning_rate": 2.0516594386237764e-05, "loss": 0.1133, "step": 6447 }, { "epoch": 0.5020096345676609, "grad_norm": 1.0690927740034404, "learning_rate": 2.0511604230398067e-05, "loss": 0.1178, "step": 6448 }, { "epoch": 0.5020874896598706, "grad_norm": 0.9830100210181004, "learning_rate": 2.050661404268784e-05, "loss": 0.1183, "step": 6449 }, { "epoch": 0.5021653447520802, "grad_norm": 1.0425480620118714, "learning_rate": 2.050162382341793e-05, "loss": 0.1098, "step": 6450 }, { "epoch": 0.5021653447520802, "eval_loss": 0.015572091564536095, "eval_runtime": 162.7364, "eval_samples_per_second": 17.697, "eval_steps_per_second": 0.633, "step": 6450 }, { "epoch": 0.5022431998442898, "grad_norm": 0.9837023132125311, "learning_rate": 2.0496633572899218e-05, "loss": 0.1281, "step": 6451 }, { "epoch": 0.5023210549364995, "grad_norm": 1.011974582687379, "learning_rate": 2.0491643291442563e-05, "loss": 0.1172, "step": 6452 }, { "epoch": 0.5023989100287091, "grad_norm": 0.9779737293525652, "learning_rate": 2.0486652979358854e-05, "loss": 0.1091, "step": 6453 }, { "epoch": 0.5024767651209187, "grad_norm": 1.0366453989283568, "learning_rate": 2.0481662636958937e-05, "loss": 0.1262, "step": 6454 }, { "epoch": 0.5025546202131284, "grad_norm": 1.1253696845211578, "learning_rate": 2.047667226455371e-05, "loss": 0.1362, "step": 6455 }, { "epoch": 0.502632475305338, "grad_norm": 1.0386008179855304, "learning_rate": 2.0471681862454036e-05, "loss": 0.1346, "step": 6456 }, { "epoch": 0.5027103303975475, "grad_norm": 0.9752438470668275, "learning_rate": 2.04666914309708e-05, "loss": 0.1131, "step": 6457 }, { "epoch": 0.5027881854897572, "grad_norm": 1.0343070396660068, "learning_rate": 2.046170097041489e-05, "loss": 0.1382, "step": 6458 }, { "epoch": 0.5028660405819668, "grad_norm": 1.1007424739511456, "learning_rate": 2.0456710481097178e-05, "loss": 0.1693, "step": 6459 }, { "epoch": 0.5029438956741764, "grad_norm": 1.0686638058590303, "learning_rate": 2.0451719963328546e-05, "loss": 0.1212, "step": 6460 }, { "epoch": 0.5030217507663861, "grad_norm": 1.0679550568853255, "learning_rate": 2.0446729417419884e-05, "loss": 0.1074, "step": 6461 }, { "epoch": 0.5030996058585957, "grad_norm": 0.9827128705777174, "learning_rate": 2.0441738843682077e-05, "loss": 0.1046, "step": 6462 }, { "epoch": 0.5031774609508053, "grad_norm": 1.0322110401637858, "learning_rate": 2.0436748242426022e-05, "loss": 0.1309, "step": 6463 }, { "epoch": 0.503255316043015, "grad_norm": 0.986120008221225, "learning_rate": 2.0431757613962598e-05, "loss": 0.112, "step": 6464 }, { "epoch": 0.5033331711352246, "grad_norm": 0.9683147007109492, "learning_rate": 2.0426766958602703e-05, "loss": 0.1137, "step": 6465 }, { "epoch": 0.5034110262274342, "grad_norm": 1.0357147354157836, "learning_rate": 2.0421776276657236e-05, "loss": 0.1163, "step": 6466 }, { "epoch": 0.5034888813196439, "grad_norm": 1.0006742422929307, "learning_rate": 2.0416785568437092e-05, "loss": 0.1079, "step": 6467 }, { "epoch": 0.5035667364118535, "grad_norm": 1.0706267547970916, "learning_rate": 2.0411794834253165e-05, "loss": 0.1335, "step": 6468 }, { "epoch": 0.503644591504063, "grad_norm": 1.0976522616507964, "learning_rate": 2.0406804074416354e-05, "loss": 0.1436, "step": 6469 }, { "epoch": 0.5037224465962726, "grad_norm": 1.0628906602701853, "learning_rate": 2.040181328923756e-05, "loss": 0.1172, "step": 6470 }, { "epoch": 0.5038003016884823, "grad_norm": 0.9731704181593398, "learning_rate": 2.0396822479027687e-05, "loss": 0.1321, "step": 6471 }, { "epoch": 0.5038781567806919, "grad_norm": 1.0201609683955126, "learning_rate": 2.039183164409764e-05, "loss": 0.1312, "step": 6472 }, { "epoch": 0.5039560118729015, "grad_norm": 0.9480147796303082, "learning_rate": 2.0386840784758325e-05, "loss": 0.1101, "step": 6473 }, { "epoch": 0.5040338669651112, "grad_norm": 1.064918434246366, "learning_rate": 2.0381849901320647e-05, "loss": 0.1384, "step": 6474 }, { "epoch": 0.5041117220573208, "grad_norm": 1.0467702215739516, "learning_rate": 2.0376858994095512e-05, "loss": 0.1047, "step": 6475 }, { "epoch": 0.5041895771495304, "grad_norm": 0.989049645223862, "learning_rate": 2.0371868063393836e-05, "loss": 0.1069, "step": 6476 }, { "epoch": 0.5042674322417401, "grad_norm": 1.0320036542873887, "learning_rate": 2.036687710952653e-05, "loss": 0.1177, "step": 6477 }, { "epoch": 0.5043452873339497, "grad_norm": 0.9573027914655396, "learning_rate": 2.0361886132804497e-05, "loss": 0.1262, "step": 6478 }, { "epoch": 0.5044231424261593, "grad_norm": 0.969912130104295, "learning_rate": 2.0356895133538662e-05, "loss": 0.1298, "step": 6479 }, { "epoch": 0.504500997518369, "grad_norm": 0.9842588370140543, "learning_rate": 2.0351904112039936e-05, "loss": 0.1147, "step": 6480 }, { "epoch": 0.5045788526105786, "grad_norm": 1.0697795565056727, "learning_rate": 2.0346913068619246e-05, "loss": 0.1381, "step": 6481 }, { "epoch": 0.5046567077027881, "grad_norm": 1.0066609540833475, "learning_rate": 2.03419220035875e-05, "loss": 0.1119, "step": 6482 }, { "epoch": 0.5047345627949978, "grad_norm": 1.0639425773229938, "learning_rate": 2.0336930917255615e-05, "loss": 0.1335, "step": 6483 }, { "epoch": 0.5048124178872074, "grad_norm": 1.0466277962897483, "learning_rate": 2.0331939809934518e-05, "loss": 0.1306, "step": 6484 }, { "epoch": 0.504890272979417, "grad_norm": 1.048211669842523, "learning_rate": 2.0326948681935135e-05, "loss": 0.1118, "step": 6485 }, { "epoch": 0.5049681280716267, "grad_norm": 0.9786532529522608, "learning_rate": 2.0321957533568383e-05, "loss": 0.1231, "step": 6486 }, { "epoch": 0.5050459831638363, "grad_norm": 1.0040044480518076, "learning_rate": 2.0316966365145193e-05, "loss": 0.142, "step": 6487 }, { "epoch": 0.5051238382560459, "grad_norm": 1.0402964563148114, "learning_rate": 2.0311975176976482e-05, "loss": 0.1134, "step": 6488 }, { "epoch": 0.5052016933482556, "grad_norm": 1.094879069356419, "learning_rate": 2.030698396937319e-05, "loss": 0.1392, "step": 6489 }, { "epoch": 0.5052795484404652, "grad_norm": 0.980731915831568, "learning_rate": 2.0301992742646238e-05, "loss": 0.0909, "step": 6490 }, { "epoch": 0.5053574035326748, "grad_norm": 1.018567928076907, "learning_rate": 2.0297001497106562e-05, "loss": 0.1273, "step": 6491 }, { "epoch": 0.5054352586248845, "grad_norm": 1.2245744499929736, "learning_rate": 2.0292010233065085e-05, "loss": 0.1303, "step": 6492 }, { "epoch": 0.5055131137170941, "grad_norm": 1.1176257922166797, "learning_rate": 2.028701895083274e-05, "loss": 0.1639, "step": 6493 }, { "epoch": 0.5055909688093037, "grad_norm": 1.0090047626461638, "learning_rate": 2.0282027650720462e-05, "loss": 0.1219, "step": 6494 }, { "epoch": 0.5056688239015134, "grad_norm": 1.1586853737066618, "learning_rate": 2.027703633303919e-05, "loss": 0.1663, "step": 6495 }, { "epoch": 0.5057466789937229, "grad_norm": 1.0585170007725684, "learning_rate": 2.0272044998099863e-05, "loss": 0.1344, "step": 6496 }, { "epoch": 0.5058245340859325, "grad_norm": 1.0143800351077366, "learning_rate": 2.0267053646213405e-05, "loss": 0.112, "step": 6497 }, { "epoch": 0.5059023891781422, "grad_norm": 1.0741785170492877, "learning_rate": 2.0262062277690756e-05, "loss": 0.1227, "step": 6498 }, { "epoch": 0.5059802442703518, "grad_norm": 0.9364740149712494, "learning_rate": 2.0257070892842865e-05, "loss": 0.1008, "step": 6499 }, { "epoch": 0.5060580993625614, "grad_norm": 1.0155259506957506, "learning_rate": 2.0252079491980664e-05, "loss": 0.1104, "step": 6500 }, { "epoch": 0.5060580993625614, "eval_loss": 0.015347495675086975, "eval_runtime": 162.5293, "eval_samples_per_second": 17.72, "eval_steps_per_second": 0.634, "step": 6500 }, { "epoch": 0.5061359544547711, "grad_norm": 1.0695236596065554, "learning_rate": 2.02470880754151e-05, "loss": 0.1249, "step": 6501 }, { "epoch": 0.5062138095469807, "grad_norm": 1.0685566590183344, "learning_rate": 2.0242096643457106e-05, "loss": 0.1487, "step": 6502 }, { "epoch": 0.5062916646391903, "grad_norm": 1.011026889583109, "learning_rate": 2.023710519641763e-05, "loss": 0.1009, "step": 6503 }, { "epoch": 0.5063695197314, "grad_norm": 1.0004976407959632, "learning_rate": 2.023211373460761e-05, "loss": 0.0988, "step": 6504 }, { "epoch": 0.5064473748236096, "grad_norm": 1.1233637716490523, "learning_rate": 2.0227122258338e-05, "loss": 0.1412, "step": 6505 }, { "epoch": 0.5065252299158192, "grad_norm": 1.0142761401478224, "learning_rate": 2.022213076791974e-05, "loss": 0.0864, "step": 6506 }, { "epoch": 0.5066030850080288, "grad_norm": 1.0412347564579194, "learning_rate": 2.0217139263663782e-05, "loss": 0.1302, "step": 6507 }, { "epoch": 0.5066809401002385, "grad_norm": 1.0307872310500277, "learning_rate": 2.0212147745881063e-05, "loss": 0.1168, "step": 6508 }, { "epoch": 0.506758795192448, "grad_norm": 1.0609207338808124, "learning_rate": 2.0207156214882536e-05, "loss": 0.1447, "step": 6509 }, { "epoch": 0.5068366502846576, "grad_norm": 0.9518452826309246, "learning_rate": 2.020216467097915e-05, "loss": 0.1044, "step": 6510 }, { "epoch": 0.5069145053768673, "grad_norm": 1.151203961390492, "learning_rate": 2.019717311448186e-05, "loss": 0.1804, "step": 6511 }, { "epoch": 0.5069923604690769, "grad_norm": 1.1074940642749547, "learning_rate": 2.019218154570161e-05, "loss": 0.1158, "step": 6512 }, { "epoch": 0.5070702155612865, "grad_norm": 0.9752611826630776, "learning_rate": 2.0187189964949355e-05, "loss": 0.0939, "step": 6513 }, { "epoch": 0.5071480706534962, "grad_norm": 0.9971791929268634, "learning_rate": 2.0182198372536036e-05, "loss": 0.1041, "step": 6514 }, { "epoch": 0.5072259257457058, "grad_norm": 0.9561762580457313, "learning_rate": 2.0177206768772626e-05, "loss": 0.1216, "step": 6515 }, { "epoch": 0.5073037808379154, "grad_norm": 1.005802488647576, "learning_rate": 2.0172215153970062e-05, "loss": 0.1142, "step": 6516 }, { "epoch": 0.5073816359301251, "grad_norm": 1.0635787754599613, "learning_rate": 2.0167223528439304e-05, "loss": 0.1066, "step": 6517 }, { "epoch": 0.5074594910223347, "grad_norm": 1.076085354493202, "learning_rate": 2.0162231892491308e-05, "loss": 0.126, "step": 6518 }, { "epoch": 0.5075373461145443, "grad_norm": 1.007696967117223, "learning_rate": 2.015724024643702e-05, "loss": 0.1076, "step": 6519 }, { "epoch": 0.507615201206754, "grad_norm": 1.0912546196598225, "learning_rate": 2.0152248590587415e-05, "loss": 0.1451, "step": 6520 }, { "epoch": 0.5076930562989636, "grad_norm": 1.0268591526944644, "learning_rate": 2.014725692525343e-05, "loss": 0.1332, "step": 6521 }, { "epoch": 0.5077709113911731, "grad_norm": 0.9725597194901877, "learning_rate": 2.0142265250746035e-05, "loss": 0.1082, "step": 6522 }, { "epoch": 0.5078487664833828, "grad_norm": 1.0127367653013497, "learning_rate": 2.013727356737618e-05, "loss": 0.1117, "step": 6523 }, { "epoch": 0.5079266215755924, "grad_norm": 0.9524917906066792, "learning_rate": 2.013228187545483e-05, "loss": 0.0852, "step": 6524 }, { "epoch": 0.508004476667802, "grad_norm": 1.000655217506305, "learning_rate": 2.0127290175292946e-05, "loss": 0.1118, "step": 6525 }, { "epoch": 0.5080823317600117, "grad_norm": 1.0303804570878619, "learning_rate": 2.0122298467201476e-05, "loss": 0.1319, "step": 6526 }, { "epoch": 0.5081601868522213, "grad_norm": 0.9305552801264473, "learning_rate": 2.0117306751491383e-05, "loss": 0.0903, "step": 6527 }, { "epoch": 0.5082380419444309, "grad_norm": 1.0137710890979377, "learning_rate": 2.0112315028473634e-05, "loss": 0.1242, "step": 6528 }, { "epoch": 0.5083158970366406, "grad_norm": 1.0398859511892231, "learning_rate": 2.0107323298459187e-05, "loss": 0.1193, "step": 6529 }, { "epoch": 0.5083937521288502, "grad_norm": 0.9779974526008602, "learning_rate": 2.0102331561759007e-05, "loss": 0.1231, "step": 6530 }, { "epoch": 0.5084716072210598, "grad_norm": 1.0440607426811164, "learning_rate": 2.0097339818684047e-05, "loss": 0.1185, "step": 6531 }, { "epoch": 0.5085494623132695, "grad_norm": 1.0608041989975214, "learning_rate": 2.009234806954527e-05, "loss": 0.1014, "step": 6532 }, { "epoch": 0.5086273174054791, "grad_norm": 1.0860455835733729, "learning_rate": 2.0087356314653648e-05, "loss": 0.1169, "step": 6533 }, { "epoch": 0.5087051724976887, "grad_norm": 1.033643319711332, "learning_rate": 2.008236455432014e-05, "loss": 0.1271, "step": 6534 }, { "epoch": 0.5087830275898984, "grad_norm": 1.041141663993333, "learning_rate": 2.0077372788855704e-05, "loss": 0.1262, "step": 6535 }, { "epoch": 0.5088608826821079, "grad_norm": 1.00184152623432, "learning_rate": 2.007238101857131e-05, "loss": 0.0935, "step": 6536 }, { "epoch": 0.5089387377743175, "grad_norm": 0.9557380663885269, "learning_rate": 2.0067389243777912e-05, "loss": 0.0991, "step": 6537 }, { "epoch": 0.5090165928665272, "grad_norm": 1.0287611998652835, "learning_rate": 2.006239746478649e-05, "loss": 0.1274, "step": 6538 }, { "epoch": 0.5090944479587368, "grad_norm": 1.0910362485619265, "learning_rate": 2.0057405681907993e-05, "loss": 0.1401, "step": 6539 }, { "epoch": 0.5091723030509464, "grad_norm": 1.0504710099523902, "learning_rate": 2.0052413895453396e-05, "loss": 0.1327, "step": 6540 }, { "epoch": 0.509250158143156, "grad_norm": 0.9880687309454398, "learning_rate": 2.0047422105733653e-05, "loss": 0.0937, "step": 6541 }, { "epoch": 0.5093280132353657, "grad_norm": 0.9353485767365665, "learning_rate": 2.0042430313059737e-05, "loss": 0.1041, "step": 6542 }, { "epoch": 0.5094058683275753, "grad_norm": 1.026913674598407, "learning_rate": 2.0037438517742617e-05, "loss": 0.1413, "step": 6543 }, { "epoch": 0.5094837234197849, "grad_norm": 0.950344051539841, "learning_rate": 2.0032446720093257e-05, "loss": 0.0948, "step": 6544 }, { "epoch": 0.5095615785119946, "grad_norm": 1.0234285031710157, "learning_rate": 2.0027454920422608e-05, "loss": 0.1046, "step": 6545 }, { "epoch": 0.5096394336042042, "grad_norm": 0.922448230142435, "learning_rate": 2.0022463119041654e-05, "loss": 0.1006, "step": 6546 }, { "epoch": 0.5097172886964138, "grad_norm": 1.1047181250669467, "learning_rate": 2.0017471316261347e-05, "loss": 0.153, "step": 6547 }, { "epoch": 0.5097951437886235, "grad_norm": 1.0755349256754798, "learning_rate": 2.0012479512392664e-05, "loss": 0.1332, "step": 6548 }, { "epoch": 0.509872998880833, "grad_norm": 1.033183295538241, "learning_rate": 2.000748770774657e-05, "loss": 0.1092, "step": 6549 }, { "epoch": 0.5099508539730426, "grad_norm": 1.0395184855308663, "learning_rate": 2.0002495902634016e-05, "loss": 0.1303, "step": 6550 }, { "epoch": 0.5099508539730426, "eval_loss": 0.015034690499305725, "eval_runtime": 162.7588, "eval_samples_per_second": 17.695, "eval_steps_per_second": 0.633, "step": 6550 }, { "epoch": 0.5100287090652523, "grad_norm": 0.9648496614008649, "learning_rate": 1.9997504097365987e-05, "loss": 0.0887, "step": 6551 }, { "epoch": 0.5101065641574619, "grad_norm": 0.9530699691480735, "learning_rate": 1.999251229225344e-05, "loss": 0.1337, "step": 6552 }, { "epoch": 0.5101844192496715, "grad_norm": 0.9774846981261788, "learning_rate": 1.9987520487607343e-05, "loss": 0.0949, "step": 6553 }, { "epoch": 0.5102622743418812, "grad_norm": 1.0303033783355342, "learning_rate": 1.9982528683738653e-05, "loss": 0.1279, "step": 6554 }, { "epoch": 0.5103401294340908, "grad_norm": 1.0333669753778167, "learning_rate": 1.9977536880958353e-05, "loss": 0.1186, "step": 6555 }, { "epoch": 0.5104179845263004, "grad_norm": 1.1116006292767155, "learning_rate": 1.9972545079577396e-05, "loss": 0.1171, "step": 6556 }, { "epoch": 0.5104958396185101, "grad_norm": 1.0162361606380026, "learning_rate": 1.9967553279906754e-05, "loss": 0.1132, "step": 6557 }, { "epoch": 0.5105736947107197, "grad_norm": 0.9781286355484035, "learning_rate": 1.9962561482257386e-05, "loss": 0.0879, "step": 6558 }, { "epoch": 0.5106515498029293, "grad_norm": 1.1565128346472093, "learning_rate": 1.9957569686940263e-05, "loss": 0.1264, "step": 6559 }, { "epoch": 0.510729404895139, "grad_norm": 0.9651280175404753, "learning_rate": 1.995257789426635e-05, "loss": 0.0783, "step": 6560 }, { "epoch": 0.5108072599873485, "grad_norm": 1.0475783364791094, "learning_rate": 1.9947586104546614e-05, "loss": 0.1204, "step": 6561 }, { "epoch": 0.5108851150795581, "grad_norm": 1.0847107198774593, "learning_rate": 1.9942594318092013e-05, "loss": 0.1406, "step": 6562 }, { "epoch": 0.5109629701717678, "grad_norm": 1.0513480327401756, "learning_rate": 1.9937602535213518e-05, "loss": 0.1328, "step": 6563 }, { "epoch": 0.5110408252639774, "grad_norm": 1.1235286153826831, "learning_rate": 1.9932610756222088e-05, "loss": 0.1465, "step": 6564 }, { "epoch": 0.511118680356187, "grad_norm": 1.0230482978316195, "learning_rate": 1.9927618981428695e-05, "loss": 0.1307, "step": 6565 }, { "epoch": 0.5111965354483967, "grad_norm": 1.059721001907812, "learning_rate": 1.9922627211144302e-05, "loss": 0.1392, "step": 6566 }, { "epoch": 0.5112743905406063, "grad_norm": 0.9747323983038753, "learning_rate": 1.9917635445679866e-05, "loss": 0.1128, "step": 6567 }, { "epoch": 0.5113522456328159, "grad_norm": 1.0875235859035777, "learning_rate": 1.9912643685346356e-05, "loss": 0.1164, "step": 6568 }, { "epoch": 0.5114301007250256, "grad_norm": 1.0391936513402422, "learning_rate": 1.990765193045473e-05, "loss": 0.1277, "step": 6569 }, { "epoch": 0.5115079558172352, "grad_norm": 1.0219258121034396, "learning_rate": 1.990266018131596e-05, "loss": 0.1427, "step": 6570 }, { "epoch": 0.5115858109094448, "grad_norm": 1.0212407641773982, "learning_rate": 1.9897668438241e-05, "loss": 0.1041, "step": 6571 }, { "epoch": 0.5116636660016545, "grad_norm": 0.9546232597189019, "learning_rate": 1.9892676701540816e-05, "loss": 0.125, "step": 6572 }, { "epoch": 0.5117415210938641, "grad_norm": 1.0799606254841343, "learning_rate": 1.9887684971526376e-05, "loss": 0.1677, "step": 6573 }, { "epoch": 0.5118193761860736, "grad_norm": 1.0531684987453938, "learning_rate": 1.988269324850862e-05, "loss": 0.1145, "step": 6574 }, { "epoch": 0.5118972312782833, "grad_norm": 1.0472126861922144, "learning_rate": 1.987770153279853e-05, "loss": 0.1489, "step": 6575 }, { "epoch": 0.5119750863704929, "grad_norm": 1.022401287315946, "learning_rate": 1.987270982470706e-05, "loss": 0.1257, "step": 6576 }, { "epoch": 0.5120529414627025, "grad_norm": 1.1393492362066497, "learning_rate": 1.9867718124545175e-05, "loss": 0.1454, "step": 6577 }, { "epoch": 0.5121307965549121, "grad_norm": 1.114171904994204, "learning_rate": 1.986272643262383e-05, "loss": 0.1413, "step": 6578 }, { "epoch": 0.5122086516471218, "grad_norm": 0.9887420642161007, "learning_rate": 1.985773474925397e-05, "loss": 0.1178, "step": 6579 }, { "epoch": 0.5122865067393314, "grad_norm": 1.0797916032204105, "learning_rate": 1.9852743074746573e-05, "loss": 0.1157, "step": 6580 }, { "epoch": 0.512364361831541, "grad_norm": 0.9321679270397359, "learning_rate": 1.984775140941259e-05, "loss": 0.1138, "step": 6581 }, { "epoch": 0.5124422169237507, "grad_norm": 0.9201236688016892, "learning_rate": 1.9842759753562987e-05, "loss": 0.1032, "step": 6582 }, { "epoch": 0.5125200720159603, "grad_norm": 1.0721676620410565, "learning_rate": 1.9837768107508702e-05, "loss": 0.125, "step": 6583 }, { "epoch": 0.5125979271081699, "grad_norm": 1.0094587282480436, "learning_rate": 1.9832776471560696e-05, "loss": 0.1238, "step": 6584 }, { "epoch": 0.5126757822003796, "grad_norm": 0.9887747675331005, "learning_rate": 1.982778484602994e-05, "loss": 0.1099, "step": 6585 }, { "epoch": 0.5127536372925892, "grad_norm": 0.9846824920396245, "learning_rate": 1.9822793231227377e-05, "loss": 0.0964, "step": 6586 }, { "epoch": 0.5128314923847987, "grad_norm": 1.0714586754451663, "learning_rate": 1.9817801627463967e-05, "loss": 0.1438, "step": 6587 }, { "epoch": 0.5129093474770084, "grad_norm": 1.0797672146047685, "learning_rate": 1.9812810035050655e-05, "loss": 0.14, "step": 6588 }, { "epoch": 0.512987202569218, "grad_norm": 1.0483725575586724, "learning_rate": 1.98078184542984e-05, "loss": 0.1401, "step": 6589 }, { "epoch": 0.5130650576614276, "grad_norm": 0.9421700032440496, "learning_rate": 1.980282688551814e-05, "loss": 0.091, "step": 6590 }, { "epoch": 0.5131429127536373, "grad_norm": 0.9690338710750598, "learning_rate": 1.9797835329020852e-05, "loss": 0.097, "step": 6591 }, { "epoch": 0.5132207678458469, "grad_norm": 1.0604158829204013, "learning_rate": 1.979284378511747e-05, "loss": 0.1273, "step": 6592 }, { "epoch": 0.5132986229380565, "grad_norm": 0.923456773674791, "learning_rate": 1.9787852254118947e-05, "loss": 0.1089, "step": 6593 }, { "epoch": 0.5133764780302662, "grad_norm": 0.9975130146997512, "learning_rate": 1.9782860736336228e-05, "loss": 0.1449, "step": 6594 }, { "epoch": 0.5134543331224758, "grad_norm": 1.0346592629975517, "learning_rate": 1.977786923208026e-05, "loss": 0.1254, "step": 6595 }, { "epoch": 0.5135321882146854, "grad_norm": 0.906685796901837, "learning_rate": 1.9772877741662003e-05, "loss": 0.1127, "step": 6596 }, { "epoch": 0.5136100433068951, "grad_norm": 0.9196445046944448, "learning_rate": 1.9767886265392396e-05, "loss": 0.1184, "step": 6597 }, { "epoch": 0.5136878983991047, "grad_norm": 0.9229605244913802, "learning_rate": 1.976289480358238e-05, "loss": 0.0822, "step": 6598 }, { "epoch": 0.5137657534913143, "grad_norm": 1.0755089087510026, "learning_rate": 1.9757903356542904e-05, "loss": 0.1293, "step": 6599 }, { "epoch": 0.513843608583524, "grad_norm": 0.9340084173172201, "learning_rate": 1.9752911924584907e-05, "loss": 0.0947, "step": 6600 }, { "epoch": 0.513843608583524, "eval_loss": 0.014656373299658298, "eval_runtime": 163.0844, "eval_samples_per_second": 17.66, "eval_steps_per_second": 0.632, "step": 6600 }, { "epoch": 0.5139214636757335, "grad_norm": 1.0763792588020806, "learning_rate": 1.974792050801934e-05, "loss": 0.1412, "step": 6601 }, { "epoch": 0.5139993187679431, "grad_norm": 1.0172265143061303, "learning_rate": 1.974292910715714e-05, "loss": 0.106, "step": 6602 }, { "epoch": 0.5140771738601528, "grad_norm": 1.0173611734030807, "learning_rate": 1.9737937722309247e-05, "loss": 0.1146, "step": 6603 }, { "epoch": 0.5141550289523624, "grad_norm": 1.042102859239062, "learning_rate": 1.9732946353786605e-05, "loss": 0.1214, "step": 6604 }, { "epoch": 0.514232884044572, "grad_norm": 0.98838950668062, "learning_rate": 1.9727955001900144e-05, "loss": 0.1052, "step": 6605 }, { "epoch": 0.5143107391367817, "grad_norm": 1.0079784114168044, "learning_rate": 1.9722963666960813e-05, "loss": 0.1292, "step": 6606 }, { "epoch": 0.5143885942289913, "grad_norm": 0.9079049747913902, "learning_rate": 1.9717972349279545e-05, "loss": 0.1206, "step": 6607 }, { "epoch": 0.5144664493212009, "grad_norm": 0.9755922901311133, "learning_rate": 1.971298104916727e-05, "loss": 0.1013, "step": 6608 }, { "epoch": 0.5145443044134106, "grad_norm": 0.9596250914265758, "learning_rate": 1.9707989766934928e-05, "loss": 0.0936, "step": 6609 }, { "epoch": 0.5146221595056202, "grad_norm": 0.9907084331564286, "learning_rate": 1.9702998502893448e-05, "loss": 0.1202, "step": 6610 }, { "epoch": 0.5147000145978298, "grad_norm": 1.0080866352989977, "learning_rate": 1.9698007257353766e-05, "loss": 0.0924, "step": 6611 }, { "epoch": 0.5147778696900395, "grad_norm": 0.9595195249647333, "learning_rate": 1.9693016030626814e-05, "loss": 0.0833, "step": 6612 }, { "epoch": 0.5148557247822491, "grad_norm": 1.030192181150222, "learning_rate": 1.968802482302352e-05, "loss": 0.1283, "step": 6613 }, { "epoch": 0.5149335798744586, "grad_norm": 1.08296794712789, "learning_rate": 1.9683033634854817e-05, "loss": 0.1221, "step": 6614 }, { "epoch": 0.5150114349666682, "grad_norm": 1.077669692461, "learning_rate": 1.967804246643162e-05, "loss": 0.0934, "step": 6615 }, { "epoch": 0.5150892900588779, "grad_norm": 0.9976245025079115, "learning_rate": 1.9673051318064872e-05, "loss": 0.1054, "step": 6616 }, { "epoch": 0.5151671451510875, "grad_norm": 1.0060135032038193, "learning_rate": 1.966806019006549e-05, "loss": 0.0893, "step": 6617 }, { "epoch": 0.5152450002432971, "grad_norm": 1.0212065686637095, "learning_rate": 1.9663069082744392e-05, "loss": 0.0971, "step": 6618 }, { "epoch": 0.5153228553355068, "grad_norm": 0.9732706850032709, "learning_rate": 1.965807799641251e-05, "loss": 0.1047, "step": 6619 }, { "epoch": 0.5154007104277164, "grad_norm": 1.036818198754567, "learning_rate": 1.9653086931380757e-05, "loss": 0.1245, "step": 6620 }, { "epoch": 0.515478565519926, "grad_norm": 0.9184636238587366, "learning_rate": 1.9648095887960064e-05, "loss": 0.0902, "step": 6621 }, { "epoch": 0.5155564206121357, "grad_norm": 0.9384087317077495, "learning_rate": 1.964310486646134e-05, "loss": 0.1162, "step": 6622 }, { "epoch": 0.5156342757043453, "grad_norm": 0.9231079330819567, "learning_rate": 1.9638113867195506e-05, "loss": 0.1157, "step": 6623 }, { "epoch": 0.5157121307965549, "grad_norm": 0.8546827540083693, "learning_rate": 1.963312289047348e-05, "loss": 0.0893, "step": 6624 }, { "epoch": 0.5157899858887646, "grad_norm": 1.0196420426384938, "learning_rate": 1.9628131936606167e-05, "loss": 0.123, "step": 6625 }, { "epoch": 0.5158678409809742, "grad_norm": 1.1687463087421825, "learning_rate": 1.962314100590449e-05, "loss": 0.1472, "step": 6626 }, { "epoch": 0.5159456960731837, "grad_norm": 0.976180119762428, "learning_rate": 1.961815009867936e-05, "loss": 0.1095, "step": 6627 }, { "epoch": 0.5160235511653934, "grad_norm": 1.0382066139405726, "learning_rate": 1.961315921524168e-05, "loss": 0.1055, "step": 6628 }, { "epoch": 0.516101406257603, "grad_norm": 0.9683650205032192, "learning_rate": 1.9608168355902362e-05, "loss": 0.0962, "step": 6629 }, { "epoch": 0.5161792613498126, "grad_norm": 1.0311299631400228, "learning_rate": 1.960317752097232e-05, "loss": 0.1087, "step": 6630 }, { "epoch": 0.5162571164420223, "grad_norm": 0.9851213417879909, "learning_rate": 1.9598186710762442e-05, "loss": 0.126, "step": 6631 }, { "epoch": 0.5163349715342319, "grad_norm": 1.1438728483329883, "learning_rate": 1.959319592558365e-05, "loss": 0.1207, "step": 6632 }, { "epoch": 0.5164128266264415, "grad_norm": 1.0302552250969146, "learning_rate": 1.958820516574684e-05, "loss": 0.1405, "step": 6633 }, { "epoch": 0.5164906817186512, "grad_norm": 1.0505446154465603, "learning_rate": 1.9583214431562915e-05, "loss": 0.1258, "step": 6634 }, { "epoch": 0.5165685368108608, "grad_norm": 0.9593739705548185, "learning_rate": 1.9578223723342774e-05, "loss": 0.0887, "step": 6635 }, { "epoch": 0.5166463919030704, "grad_norm": 0.9527251189162765, "learning_rate": 1.9573233041397297e-05, "loss": 0.1052, "step": 6636 }, { "epoch": 0.5167242469952801, "grad_norm": 0.8989191218609084, "learning_rate": 1.9568242386037405e-05, "loss": 0.108, "step": 6637 }, { "epoch": 0.5168021020874897, "grad_norm": 0.929628136062849, "learning_rate": 1.9563251757573988e-05, "loss": 0.1214, "step": 6638 }, { "epoch": 0.5168799571796993, "grad_norm": 0.9984125744863244, "learning_rate": 1.9558261156317927e-05, "loss": 0.1061, "step": 6639 }, { "epoch": 0.516957812271909, "grad_norm": 1.1192563645722284, "learning_rate": 1.955327058258013e-05, "loss": 0.1257, "step": 6640 }, { "epoch": 0.5170356673641185, "grad_norm": 0.9607022784073221, "learning_rate": 1.9548280036671457e-05, "loss": 0.1244, "step": 6641 }, { "epoch": 0.5171135224563281, "grad_norm": 1.0765178422459476, "learning_rate": 1.9543289518902825e-05, "loss": 0.1418, "step": 6642 }, { "epoch": 0.5171913775485378, "grad_norm": 0.9786926198991102, "learning_rate": 1.9538299029585112e-05, "loss": 0.1168, "step": 6643 }, { "epoch": 0.5172692326407474, "grad_norm": 0.9214535927913351, "learning_rate": 1.9533308569029202e-05, "loss": 0.1069, "step": 6644 }, { "epoch": 0.517347087732957, "grad_norm": 0.9918710320601387, "learning_rate": 1.952831813754597e-05, "loss": 0.112, "step": 6645 }, { "epoch": 0.5174249428251667, "grad_norm": 1.0833308050783472, "learning_rate": 1.952332773544629e-05, "loss": 0.1271, "step": 6646 }, { "epoch": 0.5175027979173763, "grad_norm": 1.0417663859707136, "learning_rate": 1.9518337363041066e-05, "loss": 0.1189, "step": 6647 }, { "epoch": 0.5175806530095859, "grad_norm": 1.065949008369171, "learning_rate": 1.9513347020641156e-05, "loss": 0.1716, "step": 6648 }, { "epoch": 0.5176585081017955, "grad_norm": 0.9985140184983887, "learning_rate": 1.9508356708557444e-05, "loss": 0.1086, "step": 6649 }, { "epoch": 0.5177363631940052, "grad_norm": 1.0428521385567517, "learning_rate": 1.9503366427100792e-05, "loss": 0.1175, "step": 6650 }, { "epoch": 0.5177363631940052, "eval_loss": 0.014382991008460522, "eval_runtime": 163.1761, "eval_samples_per_second": 17.65, "eval_steps_per_second": 0.631, "step": 6650 }, { "epoch": 0.5178142182862148, "grad_norm": 1.032978429040154, "learning_rate": 1.9498376176582074e-05, "loss": 0.1409, "step": 6651 }, { "epoch": 0.5178920733784244, "grad_norm": 1.0182377224345431, "learning_rate": 1.9493385957312166e-05, "loss": 0.1059, "step": 6652 }, { "epoch": 0.5179699284706341, "grad_norm": 1.0521538427999713, "learning_rate": 1.9488395769601936e-05, "loss": 0.1311, "step": 6653 }, { "epoch": 0.5180477835628436, "grad_norm": 1.0093336337831722, "learning_rate": 1.9483405613762243e-05, "loss": 0.12, "step": 6654 }, { "epoch": 0.5181256386550532, "grad_norm": 0.9774469660121945, "learning_rate": 1.947841549010395e-05, "loss": 0.1242, "step": 6655 }, { "epoch": 0.5182034937472629, "grad_norm": 1.0234203103838595, "learning_rate": 1.9473425398937913e-05, "loss": 0.1075, "step": 6656 }, { "epoch": 0.5182813488394725, "grad_norm": 1.57998091259139, "learning_rate": 1.9468435340575008e-05, "loss": 0.1215, "step": 6657 }, { "epoch": 0.5183592039316821, "grad_norm": 0.9149580387131971, "learning_rate": 1.9463445315326084e-05, "loss": 0.0819, "step": 6658 }, { "epoch": 0.5184370590238918, "grad_norm": 0.9605653066851885, "learning_rate": 1.945845532350199e-05, "loss": 0.1063, "step": 6659 }, { "epoch": 0.5185149141161014, "grad_norm": 0.9855758501257111, "learning_rate": 1.9453465365413588e-05, "loss": 0.1061, "step": 6660 }, { "epoch": 0.518592769208311, "grad_norm": 0.9806056601210211, "learning_rate": 1.9448475441371715e-05, "loss": 0.1118, "step": 6661 }, { "epoch": 0.5186706243005207, "grad_norm": 1.0397587077980825, "learning_rate": 1.9443485551687243e-05, "loss": 0.1374, "step": 6662 }, { "epoch": 0.5187484793927303, "grad_norm": 1.0198877411821474, "learning_rate": 1.9438495696671e-05, "loss": 0.0907, "step": 6663 }, { "epoch": 0.5188263344849399, "grad_norm": 1.011873486191733, "learning_rate": 1.9433505876633835e-05, "loss": 0.1396, "step": 6664 }, { "epoch": 0.5189041895771496, "grad_norm": 1.0668582190645006, "learning_rate": 1.942851609188659e-05, "loss": 0.1309, "step": 6665 }, { "epoch": 0.5189820446693592, "grad_norm": 0.9445327266829266, "learning_rate": 1.9423526342740107e-05, "loss": 0.1036, "step": 6666 }, { "epoch": 0.5190598997615687, "grad_norm": 1.0071204187523193, "learning_rate": 1.9418536629505222e-05, "loss": 0.1102, "step": 6667 }, { "epoch": 0.5191377548537784, "grad_norm": 1.1690481658675245, "learning_rate": 1.9413546952492773e-05, "loss": 0.1522, "step": 6668 }, { "epoch": 0.519215609945988, "grad_norm": 0.935133801910071, "learning_rate": 1.9408557312013597e-05, "loss": 0.0916, "step": 6669 }, { "epoch": 0.5192934650381976, "grad_norm": 1.0420068342495366, "learning_rate": 1.9403567708378514e-05, "loss": 0.1257, "step": 6670 }, { "epoch": 0.5193713201304073, "grad_norm": 1.190969964223587, "learning_rate": 1.9398578141898362e-05, "loss": 0.1126, "step": 6671 }, { "epoch": 0.5194491752226169, "grad_norm": 0.9580898455639542, "learning_rate": 1.939358861288396e-05, "loss": 0.102, "step": 6672 }, { "epoch": 0.5195270303148265, "grad_norm": 0.9635021060549088, "learning_rate": 1.9388599121646142e-05, "loss": 0.0888, "step": 6673 }, { "epoch": 0.5196048854070362, "grad_norm": 0.9898862079520326, "learning_rate": 1.9383609668495725e-05, "loss": 0.0947, "step": 6674 }, { "epoch": 0.5196827404992458, "grad_norm": 1.0235224132250076, "learning_rate": 1.937862025374353e-05, "loss": 0.1469, "step": 6675 }, { "epoch": 0.5197605955914554, "grad_norm": 0.9242852275860872, "learning_rate": 1.937363087770037e-05, "loss": 0.1114, "step": 6676 }, { "epoch": 0.5198384506836651, "grad_norm": 0.9974971335843452, "learning_rate": 1.9368641540677056e-05, "loss": 0.1191, "step": 6677 }, { "epoch": 0.5199163057758747, "grad_norm": 0.9726796616284169, "learning_rate": 1.9363652242984416e-05, "loss": 0.1157, "step": 6678 }, { "epoch": 0.5199941608680843, "grad_norm": 1.0331937904048778, "learning_rate": 1.9358662984933246e-05, "loss": 0.124, "step": 6679 }, { "epoch": 0.520072015960294, "grad_norm": 0.9642138792370094, "learning_rate": 1.935367376683436e-05, "loss": 0.0833, "step": 6680 }, { "epoch": 0.5201498710525035, "grad_norm": 0.9832337905023765, "learning_rate": 1.934868458899856e-05, "loss": 0.108, "step": 6681 }, { "epoch": 0.5202277261447131, "grad_norm": 0.9728587306338475, "learning_rate": 1.9343695451736642e-05, "loss": 0.1081, "step": 6682 }, { "epoch": 0.5203055812369228, "grad_norm": 1.0690122751480815, "learning_rate": 1.9338706355359417e-05, "loss": 0.1234, "step": 6683 }, { "epoch": 0.5203834363291324, "grad_norm": 1.0849671462200499, "learning_rate": 1.9333717300177683e-05, "loss": 0.1376, "step": 6684 }, { "epoch": 0.520461291421342, "grad_norm": 1.0779355353360631, "learning_rate": 1.9328728286502223e-05, "loss": 0.1053, "step": 6685 }, { "epoch": 0.5205391465135516, "grad_norm": 1.0614144134690364, "learning_rate": 1.932373931464384e-05, "loss": 0.1346, "step": 6686 }, { "epoch": 0.5206170016057613, "grad_norm": 0.9352207803504705, "learning_rate": 1.9318750384913308e-05, "loss": 0.0892, "step": 6687 }, { "epoch": 0.5206948566979709, "grad_norm": 0.9991232702803658, "learning_rate": 1.9313761497621436e-05, "loss": 0.1521, "step": 6688 }, { "epoch": 0.5207727117901805, "grad_norm": 1.0714526578787023, "learning_rate": 1.9308772653078995e-05, "loss": 0.1099, "step": 6689 }, { "epoch": 0.5208505668823902, "grad_norm": 0.9338255654871496, "learning_rate": 1.9303783851596767e-05, "loss": 0.0979, "step": 6690 }, { "epoch": 0.5209284219745998, "grad_norm": 0.9652883003196444, "learning_rate": 1.9298795093485534e-05, "loss": 0.1005, "step": 6691 }, { "epoch": 0.5210062770668094, "grad_norm": 0.9784551123437407, "learning_rate": 1.9293806379056068e-05, "loss": 0.1264, "step": 6692 }, { "epoch": 0.521084132159019, "grad_norm": 1.0040042645362792, "learning_rate": 1.9288817708619148e-05, "loss": 0.1348, "step": 6693 }, { "epoch": 0.5211619872512286, "grad_norm": 1.0299902311302087, "learning_rate": 1.928382908248554e-05, "loss": 0.0932, "step": 6694 }, { "epoch": 0.5212398423434382, "grad_norm": 0.995235624710577, "learning_rate": 1.9278840500966017e-05, "loss": 0.1086, "step": 6695 }, { "epoch": 0.5213176974356479, "grad_norm": 0.9222996251423938, "learning_rate": 1.927385196437134e-05, "loss": 0.0804, "step": 6696 }, { "epoch": 0.5213955525278575, "grad_norm": 1.004743790986653, "learning_rate": 1.9268863473012268e-05, "loss": 0.1109, "step": 6697 }, { "epoch": 0.5214734076200671, "grad_norm": 0.9333917893917953, "learning_rate": 1.9263875027199568e-05, "loss": 0.0964, "step": 6698 }, { "epoch": 0.5215512627122768, "grad_norm": 0.9421169587800959, "learning_rate": 1.9258886627243996e-05, "loss": 0.1138, "step": 6699 }, { "epoch": 0.5216291178044864, "grad_norm": 0.9528797835508284, "learning_rate": 1.9253898273456303e-05, "loss": 0.126, "step": 6700 }, { "epoch": 0.5216291178044864, "eval_loss": 0.014056540094316006, "eval_runtime": 163.34, "eval_samples_per_second": 17.632, "eval_steps_per_second": 0.631, "step": 6700 }, { "epoch": 0.521706972896696, "grad_norm": 0.9333103662186971, "learning_rate": 1.924890996614724e-05, "loss": 0.0951, "step": 6701 }, { "epoch": 0.5217848279889057, "grad_norm": 1.0171457853376438, "learning_rate": 1.9243921705627547e-05, "loss": 0.1202, "step": 6702 }, { "epoch": 0.5218626830811153, "grad_norm": 0.9330334905027199, "learning_rate": 1.923893349220799e-05, "loss": 0.1083, "step": 6703 }, { "epoch": 0.5219405381733249, "grad_norm": 1.0017823976855071, "learning_rate": 1.9233945326199295e-05, "loss": 0.123, "step": 6704 }, { "epoch": 0.5220183932655346, "grad_norm": 1.0573458166115581, "learning_rate": 1.9228957207912203e-05, "loss": 0.1233, "step": 6705 }, { "epoch": 0.5220962483577442, "grad_norm": 0.9507680724331565, "learning_rate": 1.922396913765746e-05, "loss": 0.1032, "step": 6706 }, { "epoch": 0.5221741034499537, "grad_norm": 0.9611167405558069, "learning_rate": 1.921898111574579e-05, "loss": 0.1056, "step": 6707 }, { "epoch": 0.5222519585421634, "grad_norm": 0.9089296727389334, "learning_rate": 1.9213993142487913e-05, "loss": 0.0974, "step": 6708 }, { "epoch": 0.522329813634373, "grad_norm": 0.9279031943849944, "learning_rate": 1.920900521819458e-05, "loss": 0.1029, "step": 6709 }, { "epoch": 0.5224076687265826, "grad_norm": 0.9355280689071058, "learning_rate": 1.92040173431765e-05, "loss": 0.0992, "step": 6710 }, { "epoch": 0.5224855238187923, "grad_norm": 0.9989151497009755, "learning_rate": 1.9199029517744406e-05, "loss": 0.1172, "step": 6711 }, { "epoch": 0.5225633789110019, "grad_norm": 1.0087453031071711, "learning_rate": 1.9194041742209003e-05, "loss": 0.1308, "step": 6712 }, { "epoch": 0.5226412340032115, "grad_norm": 0.9586828619860449, "learning_rate": 1.9189054016881004e-05, "loss": 0.1199, "step": 6713 }, { "epoch": 0.5227190890954212, "grad_norm": 0.9792435030387537, "learning_rate": 1.9184066342071134e-05, "loss": 0.0877, "step": 6714 }, { "epoch": 0.5227969441876308, "grad_norm": 0.9518724293706992, "learning_rate": 1.9179078718090098e-05, "loss": 0.1363, "step": 6715 }, { "epoch": 0.5228747992798404, "grad_norm": 1.0077649074566601, "learning_rate": 1.9174091145248602e-05, "loss": 0.1295, "step": 6716 }, { "epoch": 0.5229526543720501, "grad_norm": 0.8920555715102562, "learning_rate": 1.916910362385734e-05, "loss": 0.0832, "step": 6717 }, { "epoch": 0.5230305094642597, "grad_norm": 0.9585682091069823, "learning_rate": 1.916411615422701e-05, "loss": 0.1177, "step": 6718 }, { "epoch": 0.5231083645564693, "grad_norm": 1.0209445106849102, "learning_rate": 1.9159128736668323e-05, "loss": 0.0976, "step": 6719 }, { "epoch": 0.523186219648679, "grad_norm": 0.9806481563943901, "learning_rate": 1.9154141371491966e-05, "loss": 0.0959, "step": 6720 }, { "epoch": 0.5232640747408885, "grad_norm": 0.9818792158665898, "learning_rate": 1.9149154059008625e-05, "loss": 0.1121, "step": 6721 }, { "epoch": 0.5233419298330981, "grad_norm": 0.9564281535703725, "learning_rate": 1.9144166799528984e-05, "loss": 0.1147, "step": 6722 }, { "epoch": 0.5234197849253077, "grad_norm": 1.0632341968484342, "learning_rate": 1.913917959336372e-05, "loss": 0.0939, "step": 6723 }, { "epoch": 0.5234976400175174, "grad_norm": 1.036637611100749, "learning_rate": 1.913419244082353e-05, "loss": 0.093, "step": 6724 }, { "epoch": 0.523575495109727, "grad_norm": 1.0098059746738, "learning_rate": 1.9129205342219085e-05, "loss": 0.1334, "step": 6725 }, { "epoch": 0.5236533502019366, "grad_norm": 0.9706857013836443, "learning_rate": 1.9124218297861052e-05, "loss": 0.11, "step": 6726 }, { "epoch": 0.5237312052941463, "grad_norm": 0.8961899204267016, "learning_rate": 1.91192313080601e-05, "loss": 0.0841, "step": 6727 }, { "epoch": 0.5238090603863559, "grad_norm": 0.9619642319414694, "learning_rate": 1.911424437312689e-05, "loss": 0.1065, "step": 6728 }, { "epoch": 0.5238869154785655, "grad_norm": 0.9699961112137956, "learning_rate": 1.9109257493372107e-05, "loss": 0.1149, "step": 6729 }, { "epoch": 0.5239647705707752, "grad_norm": 0.9357097661982691, "learning_rate": 1.910427066910639e-05, "loss": 0.0917, "step": 6730 }, { "epoch": 0.5240426256629848, "grad_norm": 0.9974044523875418, "learning_rate": 1.9099283900640397e-05, "loss": 0.1077, "step": 6731 }, { "epoch": 0.5241204807551944, "grad_norm": 1.02112558064595, "learning_rate": 1.9094297188284785e-05, "loss": 0.1042, "step": 6732 }, { "epoch": 0.524198335847404, "grad_norm": 1.0539032344604333, "learning_rate": 1.9089310532350193e-05, "loss": 0.1305, "step": 6733 }, { "epoch": 0.5242761909396136, "grad_norm": 0.9994839697242064, "learning_rate": 1.9084323933147286e-05, "loss": 0.1418, "step": 6734 }, { "epoch": 0.5243540460318232, "grad_norm": 0.8579112937778316, "learning_rate": 1.9079337390986688e-05, "loss": 0.084, "step": 6735 }, { "epoch": 0.5244319011240329, "grad_norm": 0.9100106405831931, "learning_rate": 1.907435090617905e-05, "loss": 0.11, "step": 6736 }, { "epoch": 0.5245097562162425, "grad_norm": 0.9243610963653559, "learning_rate": 1.9069364479034993e-05, "loss": 0.1078, "step": 6737 }, { "epoch": 0.5245876113084521, "grad_norm": 0.9715853695655803, "learning_rate": 1.906437810986515e-05, "loss": 0.0999, "step": 6738 }, { "epoch": 0.5246654664006618, "grad_norm": 0.9172938623149505, "learning_rate": 1.9059391798980163e-05, "loss": 0.1249, "step": 6739 }, { "epoch": 0.5247433214928714, "grad_norm": 1.0067081140346943, "learning_rate": 1.905440554669064e-05, "loss": 0.1073, "step": 6740 }, { "epoch": 0.524821176585081, "grad_norm": 0.9702304248136289, "learning_rate": 1.904941935330721e-05, "loss": 0.1015, "step": 6741 }, { "epoch": 0.5248990316772907, "grad_norm": 0.9411800173517868, "learning_rate": 1.9044433219140485e-05, "loss": 0.0979, "step": 6742 }, { "epoch": 0.5249768867695003, "grad_norm": 0.9742521423789055, "learning_rate": 1.903944714450108e-05, "loss": 0.1243, "step": 6743 }, { "epoch": 0.5250547418617099, "grad_norm": 1.1262040300952219, "learning_rate": 1.9034461129699598e-05, "loss": 0.1231, "step": 6744 }, { "epoch": 0.5251325969539196, "grad_norm": 0.9311745368491583, "learning_rate": 1.9029475175046653e-05, "loss": 0.0938, "step": 6745 }, { "epoch": 0.5252104520461292, "grad_norm": 0.9976131670531199, "learning_rate": 1.9024489280852843e-05, "loss": 0.1055, "step": 6746 }, { "epoch": 0.5252883071383387, "grad_norm": 0.9737965675985267, "learning_rate": 1.9019503447428764e-05, "loss": 0.1014, "step": 6747 }, { "epoch": 0.5253661622305484, "grad_norm": 1.06585606326243, "learning_rate": 1.9014517675085014e-05, "loss": 0.1247, "step": 6748 }, { "epoch": 0.525444017322758, "grad_norm": 0.9143425671848248, "learning_rate": 1.9009531964132172e-05, "loss": 0.1048, "step": 6749 }, { "epoch": 0.5255218724149676, "grad_norm": 0.923463163744325, "learning_rate": 1.9004546314880842e-05, "loss": 0.1103, "step": 6750 }, { "epoch": 0.5255218724149676, "eval_loss": 0.013936766423285007, "eval_runtime": 162.1852, "eval_samples_per_second": 17.757, "eval_steps_per_second": 0.635, "step": 6750 }, { "epoch": 0.5255997275071773, "grad_norm": 0.922303852988047, "learning_rate": 1.8999560727641598e-05, "loss": 0.1142, "step": 6751 }, { "epoch": 0.5256775825993869, "grad_norm": 0.9558348919301568, "learning_rate": 1.8994575202725014e-05, "loss": 0.1114, "step": 6752 }, { "epoch": 0.5257554376915965, "grad_norm": 0.9818434842655736, "learning_rate": 1.8989589740441673e-05, "loss": 0.1101, "step": 6753 }, { "epoch": 0.5258332927838062, "grad_norm": 0.8800483707087562, "learning_rate": 1.8984604341102135e-05, "loss": 0.088, "step": 6754 }, { "epoch": 0.5259111478760158, "grad_norm": 0.9843616457374124, "learning_rate": 1.897961900501698e-05, "loss": 0.1209, "step": 6755 }, { "epoch": 0.5259890029682254, "grad_norm": 1.103307041155011, "learning_rate": 1.8974633732496762e-05, "loss": 0.1257, "step": 6756 }, { "epoch": 0.526066858060435, "grad_norm": 0.9386987029655561, "learning_rate": 1.8969648523852046e-05, "loss": 0.0974, "step": 6757 }, { "epoch": 0.5261447131526447, "grad_norm": 0.9801438579095786, "learning_rate": 1.896466337939338e-05, "loss": 0.1037, "step": 6758 }, { "epoch": 0.5262225682448542, "grad_norm": 0.854654754624922, "learning_rate": 1.8959678299431318e-05, "loss": 0.0823, "step": 6759 }, { "epoch": 0.5263004233370638, "grad_norm": 0.9868224059782247, "learning_rate": 1.895469328427641e-05, "loss": 0.1085, "step": 6760 }, { "epoch": 0.5263782784292735, "grad_norm": 0.9973857346317111, "learning_rate": 1.89497083342392e-05, "loss": 0.0998, "step": 6761 }, { "epoch": 0.5264561335214831, "grad_norm": 1.0156040083113, "learning_rate": 1.894472344963022e-05, "loss": 0.1299, "step": 6762 }, { "epoch": 0.5265339886136927, "grad_norm": 0.9913922706753594, "learning_rate": 1.8939738630760013e-05, "loss": 0.127, "step": 6763 }, { "epoch": 0.5266118437059024, "grad_norm": 0.9866180211910729, "learning_rate": 1.8934753877939098e-05, "loss": 0.1243, "step": 6764 }, { "epoch": 0.526689698798112, "grad_norm": 1.0536168461925501, "learning_rate": 1.8929769191478016e-05, "loss": 0.1161, "step": 6765 }, { "epoch": 0.5267675538903216, "grad_norm": 1.0511404314776287, "learning_rate": 1.892478457168728e-05, "loss": 0.13, "step": 6766 }, { "epoch": 0.5268454089825313, "grad_norm": 0.8572718122244721, "learning_rate": 1.8919800018877415e-05, "loss": 0.079, "step": 6767 }, { "epoch": 0.5269232640747409, "grad_norm": 1.01420897200376, "learning_rate": 1.891481553335893e-05, "loss": 0.1514, "step": 6768 }, { "epoch": 0.5270011191669505, "grad_norm": 0.9423381102188934, "learning_rate": 1.890983111544233e-05, "loss": 0.1078, "step": 6769 }, { "epoch": 0.5270789742591602, "grad_norm": 0.9713455391986744, "learning_rate": 1.8904846765438133e-05, "loss": 0.1081, "step": 6770 }, { "epoch": 0.5271568293513698, "grad_norm": 0.9937063965441547, "learning_rate": 1.8899862483656834e-05, "loss": 0.1102, "step": 6771 }, { "epoch": 0.5272346844435793, "grad_norm": 0.9020572004227942, "learning_rate": 1.8894878270408933e-05, "loss": 0.0729, "step": 6772 }, { "epoch": 0.527312539535789, "grad_norm": 0.9410994542922432, "learning_rate": 1.8889894126004927e-05, "loss": 0.1155, "step": 6773 }, { "epoch": 0.5273903946279986, "grad_norm": 0.9155730793045421, "learning_rate": 1.888491005075528e-05, "loss": 0.1024, "step": 6774 }, { "epoch": 0.5274682497202082, "grad_norm": 0.9523633270017334, "learning_rate": 1.887992604497051e-05, "loss": 0.1054, "step": 6775 }, { "epoch": 0.5275461048124179, "grad_norm": 0.9599706208623336, "learning_rate": 1.887494210896108e-05, "loss": 0.1103, "step": 6776 }, { "epoch": 0.5276239599046275, "grad_norm": 0.9407678724886411, "learning_rate": 1.886995824303747e-05, "loss": 0.1054, "step": 6777 }, { "epoch": 0.5277018149968371, "grad_norm": 1.0041943216809475, "learning_rate": 1.886497444751015e-05, "loss": 0.1072, "step": 6778 }, { "epoch": 0.5277796700890468, "grad_norm": 0.9213776727901272, "learning_rate": 1.885999072268958e-05, "loss": 0.1258, "step": 6779 }, { "epoch": 0.5278575251812564, "grad_norm": 0.936669349028591, "learning_rate": 1.8855007068886232e-05, "loss": 0.1175, "step": 6780 }, { "epoch": 0.527935380273466, "grad_norm": 0.9502969009337742, "learning_rate": 1.8850023486410568e-05, "loss": 0.1153, "step": 6781 }, { "epoch": 0.5280132353656757, "grad_norm": 0.96450793398822, "learning_rate": 1.8845039975573033e-05, "loss": 0.1089, "step": 6782 }, { "epoch": 0.5280910904578853, "grad_norm": 0.920918577417274, "learning_rate": 1.884005653668408e-05, "loss": 0.096, "step": 6783 }, { "epoch": 0.5281689455500949, "grad_norm": 0.9356262042115518, "learning_rate": 1.883507317005415e-05, "loss": 0.1014, "step": 6784 }, { "epoch": 0.5282468006423046, "grad_norm": 0.9053154652447936, "learning_rate": 1.883008987599368e-05, "loss": 0.1149, "step": 6785 }, { "epoch": 0.5283246557345141, "grad_norm": 0.9592053650612122, "learning_rate": 1.8825106654813115e-05, "loss": 0.1142, "step": 6786 }, { "epoch": 0.5284025108267237, "grad_norm": 0.9056043722864834, "learning_rate": 1.8820123506822888e-05, "loss": 0.1045, "step": 6787 }, { "epoch": 0.5284803659189334, "grad_norm": 0.9196511826298572, "learning_rate": 1.881514043233342e-05, "loss": 0.1026, "step": 6788 }, { "epoch": 0.528558221011143, "grad_norm": 0.9417383279730991, "learning_rate": 1.8810157431655133e-05, "loss": 0.0888, "step": 6789 }, { "epoch": 0.5286360761033526, "grad_norm": 1.0116422628223023, "learning_rate": 1.8805174505098435e-05, "loss": 0.1201, "step": 6790 }, { "epoch": 0.5287139311955623, "grad_norm": 0.950226847201387, "learning_rate": 1.8800191652973764e-05, "loss": 0.1196, "step": 6791 }, { "epoch": 0.5287917862877719, "grad_norm": 0.9307561584284572, "learning_rate": 1.8795208875591508e-05, "loss": 0.1041, "step": 6792 }, { "epoch": 0.5288696413799815, "grad_norm": 1.0278124128389692, "learning_rate": 1.8790226173262073e-05, "loss": 0.1062, "step": 6793 }, { "epoch": 0.5289474964721911, "grad_norm": 0.9488584654288311, "learning_rate": 1.8785243546295866e-05, "loss": 0.1037, "step": 6794 }, { "epoch": 0.5290253515644008, "grad_norm": 0.8920992951738643, "learning_rate": 1.8780260995003265e-05, "loss": 0.1094, "step": 6795 }, { "epoch": 0.5291032066566104, "grad_norm": 0.9170600587741677, "learning_rate": 1.8775278519694686e-05, "loss": 0.0875, "step": 6796 }, { "epoch": 0.52918106174882, "grad_norm": 1.013056616807538, "learning_rate": 1.8770296120680492e-05, "loss": 0.0881, "step": 6797 }, { "epoch": 0.5292589168410297, "grad_norm": 0.8718700383179173, "learning_rate": 1.8765313798271067e-05, "loss": 0.1008, "step": 6798 }, { "epoch": 0.5293367719332392, "grad_norm": 0.8300236490133256, "learning_rate": 1.8760331552776793e-05, "loss": 0.0888, "step": 6799 }, { "epoch": 0.5294146270254488, "grad_norm": 0.9614065788093327, "learning_rate": 1.8755349384508032e-05, "loss": 0.1297, "step": 6800 }, { "epoch": 0.5294146270254488, "eval_loss": 0.01359557081013918, "eval_runtime": 162.7244, "eval_samples_per_second": 17.699, "eval_steps_per_second": 0.633, "step": 6800 }, { "epoch": 0.5294924821176585, "grad_norm": 0.9341998590862748, "learning_rate": 1.8750367293775155e-05, "loss": 0.1057, "step": 6801 }, { "epoch": 0.5295703372098681, "grad_norm": 0.9345738441513938, "learning_rate": 1.8745385280888527e-05, "loss": 0.1174, "step": 6802 }, { "epoch": 0.5296481923020777, "grad_norm": 1.0868106496333132, "learning_rate": 1.8740403346158492e-05, "loss": 0.1483, "step": 6803 }, { "epoch": 0.5297260473942874, "grad_norm": 0.8809931612659747, "learning_rate": 1.8735421489895414e-05, "loss": 0.0963, "step": 6804 }, { "epoch": 0.529803902486497, "grad_norm": 0.9291155456853306, "learning_rate": 1.8730439712409626e-05, "loss": 0.126, "step": 6805 }, { "epoch": 0.5298817575787066, "grad_norm": 0.986059828856578, "learning_rate": 1.872545801401148e-05, "loss": 0.1269, "step": 6806 }, { "epoch": 0.5299596126709163, "grad_norm": 0.9546178630348252, "learning_rate": 1.872047639501131e-05, "loss": 0.1218, "step": 6807 }, { "epoch": 0.5300374677631259, "grad_norm": 0.9751794928705875, "learning_rate": 1.8715494855719447e-05, "loss": 0.1037, "step": 6808 }, { "epoch": 0.5301153228553355, "grad_norm": 0.9699211356353125, "learning_rate": 1.8710513396446214e-05, "loss": 0.1132, "step": 6809 }, { "epoch": 0.5301931779475452, "grad_norm": 0.9056670068098679, "learning_rate": 1.8705532017501933e-05, "loss": 0.1339, "step": 6810 }, { "epoch": 0.5302710330397548, "grad_norm": 0.8794675767209605, "learning_rate": 1.8700550719196927e-05, "loss": 0.0949, "step": 6811 }, { "epoch": 0.5303488881319643, "grad_norm": 0.8800979875193485, "learning_rate": 1.8695569501841505e-05, "loss": 0.0953, "step": 6812 }, { "epoch": 0.530426743224174, "grad_norm": 0.9374658891403774, "learning_rate": 1.869058836574597e-05, "loss": 0.1213, "step": 6813 }, { "epoch": 0.5305045983163836, "grad_norm": 0.9596848557261353, "learning_rate": 1.8685607311220625e-05, "loss": 0.1262, "step": 6814 }, { "epoch": 0.5305824534085932, "grad_norm": 0.8875598866291594, "learning_rate": 1.8680626338575762e-05, "loss": 0.0914, "step": 6815 }, { "epoch": 0.5306603085008029, "grad_norm": 0.9060768785209137, "learning_rate": 1.867564544812168e-05, "loss": 0.1241, "step": 6816 }, { "epoch": 0.5307381635930125, "grad_norm": 0.9187866328700528, "learning_rate": 1.8670664640168662e-05, "loss": 0.0967, "step": 6817 }, { "epoch": 0.5308160186852221, "grad_norm": 0.8450686967832786, "learning_rate": 1.8665683915026992e-05, "loss": 0.0694, "step": 6818 }, { "epoch": 0.5308938737774318, "grad_norm": 0.9106720858426557, "learning_rate": 1.866070327300694e-05, "loss": 0.1093, "step": 6819 }, { "epoch": 0.5309717288696414, "grad_norm": 0.8381839857452208, "learning_rate": 1.8655722714418777e-05, "loss": 0.0872, "step": 6820 }, { "epoch": 0.531049583961851, "grad_norm": 0.9356983709423276, "learning_rate": 1.8650742239572765e-05, "loss": 0.0996, "step": 6821 }, { "epoch": 0.5311274390540607, "grad_norm": 0.9825325881506487, "learning_rate": 1.8645761848779177e-05, "loss": 0.1325, "step": 6822 }, { "epoch": 0.5312052941462703, "grad_norm": 0.925073919431719, "learning_rate": 1.8640781542348258e-05, "loss": 0.1135, "step": 6823 }, { "epoch": 0.5312831492384799, "grad_norm": 0.8888402391641204, "learning_rate": 1.8635801320590263e-05, "loss": 0.0871, "step": 6824 }, { "epoch": 0.5313610043306896, "grad_norm": 0.9976757934689251, "learning_rate": 1.8630821183815432e-05, "loss": 0.1397, "step": 6825 }, { "epoch": 0.5314388594228991, "grad_norm": 0.9579230562050174, "learning_rate": 1.8625841132334e-05, "loss": 0.1122, "step": 6826 }, { "epoch": 0.5315167145151087, "grad_norm": 0.905450265123032, "learning_rate": 1.862086116645621e-05, "loss": 0.1022, "step": 6827 }, { "epoch": 0.5315945696073183, "grad_norm": 0.9260738483475335, "learning_rate": 1.8615881286492286e-05, "loss": 0.1037, "step": 6828 }, { "epoch": 0.531672424699528, "grad_norm": 0.9598420347977076, "learning_rate": 1.8610901492752457e-05, "loss": 0.1326, "step": 6829 }, { "epoch": 0.5317502797917376, "grad_norm": 0.9509920194163151, "learning_rate": 1.8605921785546932e-05, "loss": 0.1192, "step": 6830 }, { "epoch": 0.5318281348839472, "grad_norm": 0.8132130561961965, "learning_rate": 1.860094216518592e-05, "loss": 0.09, "step": 6831 }, { "epoch": 0.5319059899761569, "grad_norm": 0.9231827626099297, "learning_rate": 1.8595962631979646e-05, "loss": 0.1151, "step": 6832 }, { "epoch": 0.5319838450683665, "grad_norm": 0.8001511470333141, "learning_rate": 1.85909831862383e-05, "loss": 0.0752, "step": 6833 }, { "epoch": 0.5320617001605761, "grad_norm": 0.9246316001696893, "learning_rate": 1.858600382827207e-05, "loss": 0.1151, "step": 6834 }, { "epoch": 0.5321395552527858, "grad_norm": 0.8331035487186997, "learning_rate": 1.8581024558391162e-05, "loss": 0.0732, "step": 6835 }, { "epoch": 0.5322174103449954, "grad_norm": 0.9380510374420283, "learning_rate": 1.857604537690575e-05, "loss": 0.0906, "step": 6836 }, { "epoch": 0.532295265437205, "grad_norm": 0.8523376841548241, "learning_rate": 1.857106628412602e-05, "loss": 0.072, "step": 6837 }, { "epoch": 0.5323731205294147, "grad_norm": 0.9250343862378462, "learning_rate": 1.8566087280362145e-05, "loss": 0.1048, "step": 6838 }, { "epoch": 0.5324509756216242, "grad_norm": 0.8341802581479584, "learning_rate": 1.8561108365924294e-05, "loss": 0.0916, "step": 6839 }, { "epoch": 0.5325288307138338, "grad_norm": 0.9017056194728194, "learning_rate": 1.855612954112263e-05, "loss": 0.0982, "step": 6840 }, { "epoch": 0.5326066858060435, "grad_norm": 1.0443952643452186, "learning_rate": 1.8551150806267297e-05, "loss": 0.1563, "step": 6841 }, { "epoch": 0.5326845408982531, "grad_norm": 0.9556470317400887, "learning_rate": 1.8546172161668468e-05, "loss": 0.1191, "step": 6842 }, { "epoch": 0.5327623959904627, "grad_norm": 0.913557626100298, "learning_rate": 1.8541193607636278e-05, "loss": 0.1051, "step": 6843 }, { "epoch": 0.5328402510826724, "grad_norm": 0.9189405164756724, "learning_rate": 1.8536215144480868e-05, "loss": 0.1099, "step": 6844 }, { "epoch": 0.532918106174882, "grad_norm": 0.9938992811137836, "learning_rate": 1.8531236772512383e-05, "loss": 0.0984, "step": 6845 }, { "epoch": 0.5329959612670916, "grad_norm": 0.8898930159241214, "learning_rate": 1.852625849204093e-05, "loss": 0.1025, "step": 6846 }, { "epoch": 0.5330738163593013, "grad_norm": 0.9919887421286795, "learning_rate": 1.852128030337665e-05, "loss": 0.1173, "step": 6847 }, { "epoch": 0.5331516714515109, "grad_norm": 0.8403047646296644, "learning_rate": 1.851630220682966e-05, "loss": 0.0937, "step": 6848 }, { "epoch": 0.5332295265437205, "grad_norm": 0.9575294556339528, "learning_rate": 1.851132420271007e-05, "loss": 0.1004, "step": 6849 }, { "epoch": 0.5333073816359302, "grad_norm": 0.9451734574150218, "learning_rate": 1.8506346291327982e-05, "loss": 0.1135, "step": 6850 }, { "epoch": 0.5333073816359302, "eval_loss": 0.013109379447996616, "eval_runtime": 162.2038, "eval_samples_per_second": 17.755, "eval_steps_per_second": 0.635, "step": 6850 }, { "epoch": 0.5333852367281398, "grad_norm": 0.965143868284336, "learning_rate": 1.8501368472993492e-05, "loss": 0.1144, "step": 6851 }, { "epoch": 0.5334630918203493, "grad_norm": 0.9311415414087507, "learning_rate": 1.8496390748016707e-05, "loss": 0.1156, "step": 6852 }, { "epoch": 0.533540946912559, "grad_norm": 0.8814753909936942, "learning_rate": 1.8491413116707712e-05, "loss": 0.1199, "step": 6853 }, { "epoch": 0.5336188020047686, "grad_norm": 0.915615626075536, "learning_rate": 1.8486435579376596e-05, "loss": 0.1144, "step": 6854 }, { "epoch": 0.5336966570969782, "grad_norm": 0.8922601462497252, "learning_rate": 1.8481458136333423e-05, "loss": 0.1145, "step": 6855 }, { "epoch": 0.5337745121891879, "grad_norm": 0.9119484299648751, "learning_rate": 1.847648078788826e-05, "loss": 0.085, "step": 6856 }, { "epoch": 0.5338523672813975, "grad_norm": 1.0370201989839662, "learning_rate": 1.8471503534351195e-05, "loss": 0.1166, "step": 6857 }, { "epoch": 0.5339302223736071, "grad_norm": 0.9867797770050714, "learning_rate": 1.8466526376032275e-05, "loss": 0.0903, "step": 6858 }, { "epoch": 0.5340080774658168, "grad_norm": 0.9410636697682015, "learning_rate": 1.8461549313241552e-05, "loss": 0.1183, "step": 6859 }, { "epoch": 0.5340859325580264, "grad_norm": 0.9283885379069033, "learning_rate": 1.8456572346289077e-05, "loss": 0.0776, "step": 6860 }, { "epoch": 0.534163787650236, "grad_norm": 0.9624051449816825, "learning_rate": 1.845159547548489e-05, "loss": 0.1021, "step": 6861 }, { "epoch": 0.5342416427424457, "grad_norm": 0.9689900607223011, "learning_rate": 1.8446618701139013e-05, "loss": 0.1149, "step": 6862 }, { "epoch": 0.5343194978346553, "grad_norm": 0.8910515872131257, "learning_rate": 1.8441642023561507e-05, "loss": 0.0974, "step": 6863 }, { "epoch": 0.5343973529268649, "grad_norm": 1.0384446797753282, "learning_rate": 1.8436665443062372e-05, "loss": 0.0918, "step": 6864 }, { "epoch": 0.5344752080190744, "grad_norm": 0.9602116262953386, "learning_rate": 1.8431688959951633e-05, "loss": 0.11, "step": 6865 }, { "epoch": 0.5345530631112841, "grad_norm": 0.8911636626070301, "learning_rate": 1.84267125745393e-05, "loss": 0.1397, "step": 6866 }, { "epoch": 0.5346309182034937, "grad_norm": 0.9147936453165836, "learning_rate": 1.8421736287135372e-05, "loss": 0.1134, "step": 6867 }, { "epoch": 0.5347087732957033, "grad_norm": 0.8963240904773278, "learning_rate": 1.841676009804986e-05, "loss": 0.0982, "step": 6868 }, { "epoch": 0.534786628387913, "grad_norm": 1.0416865025107078, "learning_rate": 1.8411784007592754e-05, "loss": 0.1559, "step": 6869 }, { "epoch": 0.5348644834801226, "grad_norm": 0.9827657775961207, "learning_rate": 1.840680801607404e-05, "loss": 0.1189, "step": 6870 }, { "epoch": 0.5349423385723322, "grad_norm": 0.9879510288143368, "learning_rate": 1.8401832123803692e-05, "loss": 0.1083, "step": 6871 }, { "epoch": 0.5350201936645419, "grad_norm": 1.0026279735882666, "learning_rate": 1.839685633109169e-05, "loss": 0.1211, "step": 6872 }, { "epoch": 0.5350980487567515, "grad_norm": 0.8594464190581729, "learning_rate": 1.839188063824801e-05, "loss": 0.0927, "step": 6873 }, { "epoch": 0.5351759038489611, "grad_norm": 0.9689796654867325, "learning_rate": 1.8386905045582602e-05, "loss": 0.1149, "step": 6874 }, { "epoch": 0.5352537589411708, "grad_norm": 0.9122280677098696, "learning_rate": 1.8381929553405432e-05, "loss": 0.1304, "step": 6875 }, { "epoch": 0.5353316140333804, "grad_norm": 0.9177659464925023, "learning_rate": 1.8376954162026443e-05, "loss": 0.0952, "step": 6876 }, { "epoch": 0.53540946912559, "grad_norm": 0.8221976378186385, "learning_rate": 1.837197887175558e-05, "loss": 0.0917, "step": 6877 }, { "epoch": 0.5354873242177997, "grad_norm": 0.9746988567677675, "learning_rate": 1.836700368290278e-05, "loss": 0.1181, "step": 6878 }, { "epoch": 0.5355651793100092, "grad_norm": 0.9836137554262098, "learning_rate": 1.8362028595777975e-05, "loss": 0.1345, "step": 6879 }, { "epoch": 0.5356430344022188, "grad_norm": 0.8556935126625579, "learning_rate": 1.8357053610691093e-05, "loss": 0.0781, "step": 6880 }, { "epoch": 0.5357208894944285, "grad_norm": 0.9458376639571879, "learning_rate": 1.8352078727952044e-05, "loss": 0.1099, "step": 6881 }, { "epoch": 0.5357987445866381, "grad_norm": 1.000493065623463, "learning_rate": 1.8347103947870743e-05, "loss": 0.15, "step": 6882 }, { "epoch": 0.5358765996788477, "grad_norm": 0.9576589137003855, "learning_rate": 1.83421292707571e-05, "loss": 0.1366, "step": 6883 }, { "epoch": 0.5359544547710574, "grad_norm": 0.8265589135717613, "learning_rate": 1.8337154696921015e-05, "loss": 0.1057, "step": 6884 }, { "epoch": 0.536032309863267, "grad_norm": 0.9809741449309725, "learning_rate": 1.8332180226672372e-05, "loss": 0.0898, "step": 6885 }, { "epoch": 0.5361101649554766, "grad_norm": 0.7816903655498385, "learning_rate": 1.832720586032106e-05, "loss": 0.0838, "step": 6886 }, { "epoch": 0.5361880200476863, "grad_norm": 0.9264911576533061, "learning_rate": 1.832223159817696e-05, "loss": 0.0885, "step": 6887 }, { "epoch": 0.5362658751398959, "grad_norm": 0.9321685976001737, "learning_rate": 1.8317257440549948e-05, "loss": 0.0836, "step": 6888 }, { "epoch": 0.5363437302321055, "grad_norm": 0.8850112224038434, "learning_rate": 1.831228338774989e-05, "loss": 0.0861, "step": 6889 }, { "epoch": 0.5364215853243152, "grad_norm": 0.8849074262769971, "learning_rate": 1.830730944008664e-05, "loss": 0.1047, "step": 6890 }, { "epoch": 0.5364994404165248, "grad_norm": 0.9700647076331287, "learning_rate": 1.8302335597870056e-05, "loss": 0.1039, "step": 6891 }, { "epoch": 0.5365772955087343, "grad_norm": 0.8048436436411585, "learning_rate": 1.829736186140998e-05, "loss": 0.0786, "step": 6892 }, { "epoch": 0.536655150600944, "grad_norm": 0.8023558234428492, "learning_rate": 1.829238823101626e-05, "loss": 0.0952, "step": 6893 }, { "epoch": 0.5367330056931536, "grad_norm": 0.9322743682639771, "learning_rate": 1.828741470699873e-05, "loss": 0.0987, "step": 6894 }, { "epoch": 0.5368108607853632, "grad_norm": 0.9668122645815047, "learning_rate": 1.8282441289667214e-05, "loss": 0.1044, "step": 6895 }, { "epoch": 0.5368887158775729, "grad_norm": 0.9026405839039771, "learning_rate": 1.827746797933153e-05, "loss": 0.1003, "step": 6896 }, { "epoch": 0.5369665709697825, "grad_norm": 0.8788911001110425, "learning_rate": 1.8272494776301497e-05, "loss": 0.0888, "step": 6897 }, { "epoch": 0.5370444260619921, "grad_norm": 0.8852995334453925, "learning_rate": 1.8267521680886915e-05, "loss": 0.1038, "step": 6898 }, { "epoch": 0.5371222811542018, "grad_norm": 0.8675341941035709, "learning_rate": 1.8262548693397594e-05, "loss": 0.095, "step": 6899 }, { "epoch": 0.5372001362464114, "grad_norm": 0.9492568138596085, "learning_rate": 1.8257575814143316e-05, "loss": 0.1377, "step": 6900 }, { "epoch": 0.5372001362464114, "eval_loss": 0.013025302439928055, "eval_runtime": 162.6002, "eval_samples_per_second": 17.712, "eval_steps_per_second": 0.633, "step": 6900 }, { "epoch": 0.537277991338621, "grad_norm": 0.863954742720886, "learning_rate": 1.825260304343388e-05, "loss": 0.0999, "step": 6901 }, { "epoch": 0.5373558464308306, "grad_norm": 0.9441899020940906, "learning_rate": 1.8247630381579066e-05, "loss": 0.1073, "step": 6902 }, { "epoch": 0.5374337015230403, "grad_norm": 0.9188568466049839, "learning_rate": 1.8242657828888627e-05, "loss": 0.0856, "step": 6903 }, { "epoch": 0.5375115566152499, "grad_norm": 0.8768953429235292, "learning_rate": 1.823768538567236e-05, "loss": 0.0811, "step": 6904 }, { "epoch": 0.5375894117074594, "grad_norm": 0.9337306626982121, "learning_rate": 1.823271305224e-05, "loss": 0.0963, "step": 6905 }, { "epoch": 0.5376672667996691, "grad_norm": 0.9223318239052031, "learning_rate": 1.8227740828901314e-05, "loss": 0.0893, "step": 6906 }, { "epoch": 0.5377451218918787, "grad_norm": 0.9264702623431534, "learning_rate": 1.822276871596605e-05, "loss": 0.1131, "step": 6907 }, { "epoch": 0.5378229769840883, "grad_norm": 0.9006483330609899, "learning_rate": 1.8217796713743927e-05, "loss": 0.0784, "step": 6908 }, { "epoch": 0.537900832076298, "grad_norm": 0.8060299133793067, "learning_rate": 1.8212824822544698e-05, "loss": 0.0654, "step": 6909 }, { "epoch": 0.5379786871685076, "grad_norm": 0.899082380922222, "learning_rate": 1.8207853042678087e-05, "loss": 0.1021, "step": 6910 }, { "epoch": 0.5380565422607172, "grad_norm": 0.9121761672094929, "learning_rate": 1.8202881374453807e-05, "loss": 0.0893, "step": 6911 }, { "epoch": 0.5381343973529269, "grad_norm": 1.0223640963697336, "learning_rate": 1.819790981818157e-05, "loss": 0.1169, "step": 6912 }, { "epoch": 0.5382122524451365, "grad_norm": 0.8710197661873832, "learning_rate": 1.819293837417107e-05, "loss": 0.104, "step": 6913 }, { "epoch": 0.5382901075373461, "grad_norm": 0.8524045224585527, "learning_rate": 1.8187967042732025e-05, "loss": 0.0888, "step": 6914 }, { "epoch": 0.5383679626295558, "grad_norm": 0.9699213867389187, "learning_rate": 1.8182995824174115e-05, "loss": 0.1106, "step": 6915 }, { "epoch": 0.5384458177217654, "grad_norm": 0.9934592426980897, "learning_rate": 1.817802471880703e-05, "loss": 0.091, "step": 6916 }, { "epoch": 0.538523672813975, "grad_norm": 0.8624951868408984, "learning_rate": 1.8173053726940435e-05, "loss": 0.1021, "step": 6917 }, { "epoch": 0.5386015279061847, "grad_norm": 0.9156476568043037, "learning_rate": 1.8168082848883996e-05, "loss": 0.1386, "step": 6918 }, { "epoch": 0.5386793829983942, "grad_norm": 0.8443409024742977, "learning_rate": 1.8163112084947398e-05, "loss": 0.0908, "step": 6919 }, { "epoch": 0.5387572380906038, "grad_norm": 0.898677844321832, "learning_rate": 1.8158141435440282e-05, "loss": 0.1036, "step": 6920 }, { "epoch": 0.5388350931828135, "grad_norm": 0.9696961192864987, "learning_rate": 1.8153170900672293e-05, "loss": 0.1058, "step": 6921 }, { "epoch": 0.5389129482750231, "grad_norm": 0.8887197020546732, "learning_rate": 1.814820048095308e-05, "loss": 0.1131, "step": 6922 }, { "epoch": 0.5389908033672327, "grad_norm": 0.9775965599417339, "learning_rate": 1.8143230176592263e-05, "loss": 0.0864, "step": 6923 }, { "epoch": 0.5390686584594424, "grad_norm": 0.9843219101136089, "learning_rate": 1.8138259987899483e-05, "loss": 0.1033, "step": 6924 }, { "epoch": 0.539146513551652, "grad_norm": 0.9035528640712372, "learning_rate": 1.8133289915184366e-05, "loss": 0.0739, "step": 6925 }, { "epoch": 0.5392243686438616, "grad_norm": 0.9058782149503093, "learning_rate": 1.81283199587565e-05, "loss": 0.0928, "step": 6926 }, { "epoch": 0.5393022237360713, "grad_norm": 0.8913478292991022, "learning_rate": 1.8123350118925513e-05, "loss": 0.0941, "step": 6927 }, { "epoch": 0.5393800788282809, "grad_norm": 0.8757405509622348, "learning_rate": 1.8118380396000975e-05, "loss": 0.0799, "step": 6928 }, { "epoch": 0.5394579339204905, "grad_norm": 0.8730726997953798, "learning_rate": 1.8113410790292513e-05, "loss": 0.1175, "step": 6929 }, { "epoch": 0.5395357890127002, "grad_norm": 0.8645965146068876, "learning_rate": 1.8108441302109683e-05, "loss": 0.0811, "step": 6930 }, { "epoch": 0.5396136441049098, "grad_norm": 0.9542430353822957, "learning_rate": 1.810347193176207e-05, "loss": 0.108, "step": 6931 }, { "epoch": 0.5396914991971193, "grad_norm": 0.8301717619001711, "learning_rate": 1.809850267955924e-05, "loss": 0.0832, "step": 6932 }, { "epoch": 0.539769354289329, "grad_norm": 0.8528232982429508, "learning_rate": 1.8093533545810747e-05, "loss": 0.0935, "step": 6933 }, { "epoch": 0.5398472093815386, "grad_norm": 0.9411441045001656, "learning_rate": 1.808856453082617e-05, "loss": 0.0957, "step": 6934 }, { "epoch": 0.5399250644737482, "grad_norm": 0.9048234934647172, "learning_rate": 1.808359563491503e-05, "loss": 0.1055, "step": 6935 }, { "epoch": 0.5400029195659578, "grad_norm": 0.8820122509808449, "learning_rate": 1.8078626858386873e-05, "loss": 0.0957, "step": 6936 }, { "epoch": 0.5400807746581675, "grad_norm": 0.9090598269614868, "learning_rate": 1.8073658201551234e-05, "loss": 0.1059, "step": 6937 }, { "epoch": 0.5401586297503771, "grad_norm": 0.8262130937654832, "learning_rate": 1.8068689664717635e-05, "loss": 0.0809, "step": 6938 }, { "epoch": 0.5402364848425867, "grad_norm": 0.790550619831216, "learning_rate": 1.8063721248195584e-05, "loss": 0.0828, "step": 6939 }, { "epoch": 0.5403143399347964, "grad_norm": 0.8933167634054288, "learning_rate": 1.8058752952294605e-05, "loss": 0.0827, "step": 6940 }, { "epoch": 0.540392195027006, "grad_norm": 0.9041878571919852, "learning_rate": 1.805378477732419e-05, "loss": 0.0938, "step": 6941 }, { "epoch": 0.5404700501192156, "grad_norm": 0.8720773105084382, "learning_rate": 1.804881672359384e-05, "loss": 0.0835, "step": 6942 }, { "epoch": 0.5405479052114253, "grad_norm": 0.8770390093344098, "learning_rate": 1.8043848791413033e-05, "loss": 0.1073, "step": 6943 }, { "epoch": 0.5406257603036349, "grad_norm": 0.8846183719186953, "learning_rate": 1.803888098109125e-05, "loss": 0.116, "step": 6944 }, { "epoch": 0.5407036153958444, "grad_norm": 0.9052912499757789, "learning_rate": 1.8033913292937965e-05, "loss": 0.1032, "step": 6945 }, { "epoch": 0.5407814704880541, "grad_norm": 1.1068750287542606, "learning_rate": 1.802894572726264e-05, "loss": 0.1269, "step": 6946 }, { "epoch": 0.5408593255802637, "grad_norm": 0.9439089137959182, "learning_rate": 1.8023978284374736e-05, "loss": 0.1119, "step": 6947 }, { "epoch": 0.5409371806724733, "grad_norm": 0.8445141988840872, "learning_rate": 1.801901096458369e-05, "loss": 0.0773, "step": 6948 }, { "epoch": 0.541015035764683, "grad_norm": 0.8848976958639364, "learning_rate": 1.8014043768198952e-05, "loss": 0.0921, "step": 6949 }, { "epoch": 0.5410928908568926, "grad_norm": 0.8497684366015749, "learning_rate": 1.8009076695529952e-05, "loss": 0.0927, "step": 6950 }, { "epoch": 0.5410928908568926, "eval_loss": 0.01281512901186943, "eval_runtime": 162.6879, "eval_samples_per_second": 17.703, "eval_steps_per_second": 0.633, "step": 6950 }, { "epoch": 0.5411707459491022, "grad_norm": 0.8652827294981055, "learning_rate": 1.8004109746886118e-05, "loss": 0.0837, "step": 6951 }, { "epoch": 0.5412486010413119, "grad_norm": 0.8812719985323014, "learning_rate": 1.7999142922576865e-05, "loss": 0.0744, "step": 6952 }, { "epoch": 0.5413264561335215, "grad_norm": 0.9881697657058477, "learning_rate": 1.7994176222911603e-05, "loss": 0.1216, "step": 6953 }, { "epoch": 0.5414043112257311, "grad_norm": 0.9999613799869173, "learning_rate": 1.7989209648199727e-05, "loss": 0.1304, "step": 6954 }, { "epoch": 0.5414821663179408, "grad_norm": 0.9741459849549539, "learning_rate": 1.7984243198750647e-05, "loss": 0.1137, "step": 6955 }, { "epoch": 0.5415600214101504, "grad_norm": 0.989501139064821, "learning_rate": 1.7979276874873738e-05, "loss": 0.1306, "step": 6956 }, { "epoch": 0.54163787650236, "grad_norm": 0.9082377876926424, "learning_rate": 1.7974310676878383e-05, "loss": 0.0946, "step": 6957 }, { "epoch": 0.5417157315945696, "grad_norm": 0.866946539631039, "learning_rate": 1.7969344605073952e-05, "loss": 0.103, "step": 6958 }, { "epoch": 0.5417935866867792, "grad_norm": 0.8874148256132584, "learning_rate": 1.7964378659769802e-05, "loss": 0.1141, "step": 6959 }, { "epoch": 0.5418714417789888, "grad_norm": 0.9047946670110253, "learning_rate": 1.7959412841275297e-05, "loss": 0.083, "step": 6960 }, { "epoch": 0.5419492968711985, "grad_norm": 0.8482486282581866, "learning_rate": 1.795444714989978e-05, "loss": 0.0915, "step": 6961 }, { "epoch": 0.5420271519634081, "grad_norm": 0.8613843375751855, "learning_rate": 1.794948158595259e-05, "loss": 0.0838, "step": 6962 }, { "epoch": 0.5421050070556177, "grad_norm": 0.8342619358859482, "learning_rate": 1.794451614974306e-05, "loss": 0.0966, "step": 6963 }, { "epoch": 0.5421828621478274, "grad_norm": 0.9614592035453012, "learning_rate": 1.7939550841580508e-05, "loss": 0.1252, "step": 6964 }, { "epoch": 0.542260717240037, "grad_norm": 0.869240396692559, "learning_rate": 1.793458566177426e-05, "loss": 0.101, "step": 6965 }, { "epoch": 0.5423385723322466, "grad_norm": 0.7549261244637371, "learning_rate": 1.7929620610633614e-05, "loss": 0.0759, "step": 6966 }, { "epoch": 0.5424164274244563, "grad_norm": 0.9491779989047253, "learning_rate": 1.7924655688467873e-05, "loss": 0.1109, "step": 6967 }, { "epoch": 0.5424942825166659, "grad_norm": 0.8079631888088743, "learning_rate": 1.791969089558633e-05, "loss": 0.0683, "step": 6968 }, { "epoch": 0.5425721376088755, "grad_norm": 0.8403965821716933, "learning_rate": 1.7914726232298258e-05, "loss": 0.0753, "step": 6969 }, { "epoch": 0.5426499927010852, "grad_norm": 0.9117497576916354, "learning_rate": 1.7909761698912945e-05, "loss": 0.0955, "step": 6970 }, { "epoch": 0.5427278477932947, "grad_norm": 0.8361315479020581, "learning_rate": 1.7904797295739653e-05, "loss": 0.0827, "step": 6971 }, { "epoch": 0.5428057028855043, "grad_norm": 0.8063832120184886, "learning_rate": 1.7899833023087643e-05, "loss": 0.0841, "step": 6972 }, { "epoch": 0.5428835579777139, "grad_norm": 0.8348124482178831, "learning_rate": 1.7894868881266163e-05, "loss": 0.0872, "step": 6973 }, { "epoch": 0.5429614130699236, "grad_norm": 0.9128835235544859, "learning_rate": 1.7889904870584462e-05, "loss": 0.0944, "step": 6974 }, { "epoch": 0.5430392681621332, "grad_norm": 0.8925352257518643, "learning_rate": 1.7884940991351757e-05, "loss": 0.0926, "step": 6975 }, { "epoch": 0.5431171232543428, "grad_norm": 0.9710391044014359, "learning_rate": 1.787997724387729e-05, "loss": 0.1324, "step": 6976 }, { "epoch": 0.5431949783465525, "grad_norm": 0.8442260082239935, "learning_rate": 1.787501362847028e-05, "loss": 0.0751, "step": 6977 }, { "epoch": 0.5432728334387621, "grad_norm": 0.9180068313502259, "learning_rate": 1.7870050145439938e-05, "loss": 0.1087, "step": 6978 }, { "epoch": 0.5433506885309717, "grad_norm": 0.9069117401467565, "learning_rate": 1.7865086795095453e-05, "loss": 0.0766, "step": 6979 }, { "epoch": 0.5434285436231814, "grad_norm": 0.849299003781959, "learning_rate": 1.786012357774602e-05, "loss": 0.0893, "step": 6980 }, { "epoch": 0.543506398715391, "grad_norm": 0.7562928554748383, "learning_rate": 1.7855160493700836e-05, "loss": 0.0659, "step": 6981 }, { "epoch": 0.5435842538076006, "grad_norm": 0.94719427036112, "learning_rate": 1.7850197543269073e-05, "loss": 0.1612, "step": 6982 }, { "epoch": 0.5436621088998103, "grad_norm": 0.773139255070054, "learning_rate": 1.7845234726759903e-05, "loss": 0.0756, "step": 6983 }, { "epoch": 0.5437399639920198, "grad_norm": 0.835029004537553, "learning_rate": 1.7840272044482473e-05, "loss": 0.0818, "step": 6984 }, { "epoch": 0.5438178190842294, "grad_norm": 0.8962231803651164, "learning_rate": 1.7835309496745938e-05, "loss": 0.1242, "step": 6985 }, { "epoch": 0.5438956741764391, "grad_norm": 0.8052391259658768, "learning_rate": 1.7830347083859453e-05, "loss": 0.0933, "step": 6986 }, { "epoch": 0.5439735292686487, "grad_norm": 0.8224144996834463, "learning_rate": 1.7825384806132152e-05, "loss": 0.0761, "step": 6987 }, { "epoch": 0.5440513843608583, "grad_norm": 0.9225554871362138, "learning_rate": 1.782042266387315e-05, "loss": 0.118, "step": 6988 }, { "epoch": 0.544129239453068, "grad_norm": 0.8589310493643475, "learning_rate": 1.7815460657391574e-05, "loss": 0.0776, "step": 6989 }, { "epoch": 0.5442070945452776, "grad_norm": 0.8610057313574444, "learning_rate": 1.7810498786996522e-05, "loss": 0.086, "step": 6990 }, { "epoch": 0.5442849496374872, "grad_norm": 0.8907317217946477, "learning_rate": 1.7805537052997112e-05, "loss": 0.0824, "step": 6991 }, { "epoch": 0.5443628047296969, "grad_norm": 0.7450366159496807, "learning_rate": 1.780057545570243e-05, "loss": 0.064, "step": 6992 }, { "epoch": 0.5444406598219065, "grad_norm": 0.8460152107049722, "learning_rate": 1.7795613995421558e-05, "loss": 0.098, "step": 6993 }, { "epoch": 0.5445185149141161, "grad_norm": 0.8723604364045277, "learning_rate": 1.7790652672463574e-05, "loss": 0.0933, "step": 6994 }, { "epoch": 0.5445963700063258, "grad_norm": 0.9789067029445151, "learning_rate": 1.7785691487137532e-05, "loss": 0.1326, "step": 6995 }, { "epoch": 0.5446742250985354, "grad_norm": 0.8221042812191806, "learning_rate": 1.7780730439752512e-05, "loss": 0.1023, "step": 6996 }, { "epoch": 0.544752080190745, "grad_norm": 0.8316560999918474, "learning_rate": 1.7775769530617558e-05, "loss": 0.0883, "step": 6997 }, { "epoch": 0.5448299352829546, "grad_norm": 0.8644883421315359, "learning_rate": 1.77708087600417e-05, "loss": 0.092, "step": 6998 }, { "epoch": 0.5449077903751642, "grad_norm": 0.7852943706205423, "learning_rate": 1.7765848128333982e-05, "loss": 0.0754, "step": 6999 }, { "epoch": 0.5449856454673738, "grad_norm": 0.871468353007378, "learning_rate": 1.7760887635803414e-05, "loss": 0.1032, "step": 7000 }, { "epoch": 0.5449856454673738, "eval_loss": 0.012474551796913147, "eval_runtime": 163.1621, "eval_samples_per_second": 17.651, "eval_steps_per_second": 0.631, "step": 7000 }, { "epoch": 0.5450635005595835, "grad_norm": 0.9273598252703248, "learning_rate": 1.7755927282759037e-05, "loss": 0.0883, "step": 7001 }, { "epoch": 0.5451413556517931, "grad_norm": 0.9429306057822019, "learning_rate": 1.7750967069509834e-05, "loss": 0.1227, "step": 7002 }, { "epoch": 0.5452192107440027, "grad_norm": 0.9197444735670995, "learning_rate": 1.7746006996364814e-05, "loss": 0.1109, "step": 7003 }, { "epoch": 0.5452970658362124, "grad_norm": 0.8506107347860458, "learning_rate": 1.7741047063632964e-05, "loss": 0.0878, "step": 7004 }, { "epoch": 0.545374920928422, "grad_norm": 0.9952660224856908, "learning_rate": 1.7736087271623258e-05, "loss": 0.076, "step": 7005 }, { "epoch": 0.5454527760206316, "grad_norm": 0.9015691799322482, "learning_rate": 1.7731127620644677e-05, "loss": 0.1166, "step": 7006 }, { "epoch": 0.5455306311128412, "grad_norm": 0.9100937475715983, "learning_rate": 1.772616811100618e-05, "loss": 0.0964, "step": 7007 }, { "epoch": 0.5456084862050509, "grad_norm": 0.850507095307022, "learning_rate": 1.7721208743016726e-05, "loss": 0.1069, "step": 7008 }, { "epoch": 0.5456863412972605, "grad_norm": 0.8390253562188237, "learning_rate": 1.7716249516985254e-05, "loss": 0.1201, "step": 7009 }, { "epoch": 0.54576419638947, "grad_norm": 0.9150909460898576, "learning_rate": 1.7711290433220697e-05, "loss": 0.1061, "step": 7010 }, { "epoch": 0.5458420514816797, "grad_norm": 0.813827443406727, "learning_rate": 1.7706331492031995e-05, "loss": 0.0801, "step": 7011 }, { "epoch": 0.5459199065738893, "grad_norm": 0.9094735676334806, "learning_rate": 1.7701372693728057e-05, "loss": 0.0803, "step": 7012 }, { "epoch": 0.5459977616660989, "grad_norm": 0.9351922163638279, "learning_rate": 1.7696414038617798e-05, "loss": 0.0901, "step": 7013 }, { "epoch": 0.5460756167583086, "grad_norm": 0.885510836141551, "learning_rate": 1.7691455527010112e-05, "loss": 0.1344, "step": 7014 }, { "epoch": 0.5461534718505182, "grad_norm": 0.7929776265541176, "learning_rate": 1.76864971592139e-05, "loss": 0.0704, "step": 7015 }, { "epoch": 0.5462313269427278, "grad_norm": 0.8306870737742315, "learning_rate": 1.7681538935538035e-05, "loss": 0.1205, "step": 7016 }, { "epoch": 0.5463091820349375, "grad_norm": 0.879545116971742, "learning_rate": 1.76765808562914e-05, "loss": 0.0868, "step": 7017 }, { "epoch": 0.5463870371271471, "grad_norm": 0.9124196351646642, "learning_rate": 1.7671622921782858e-05, "loss": 0.1071, "step": 7018 }, { "epoch": 0.5464648922193567, "grad_norm": 0.7735582723730661, "learning_rate": 1.7666665132321263e-05, "loss": 0.0854, "step": 7019 }, { "epoch": 0.5465427473115664, "grad_norm": 0.8215136258900733, "learning_rate": 1.7661707488215466e-05, "loss": 0.0885, "step": 7020 }, { "epoch": 0.546620602403776, "grad_norm": 0.9526276271113219, "learning_rate": 1.765674998977429e-05, "loss": 0.1036, "step": 7021 }, { "epoch": 0.5466984574959856, "grad_norm": 0.9716082719115723, "learning_rate": 1.7651792637306587e-05, "loss": 0.1399, "step": 7022 }, { "epoch": 0.5467763125881953, "grad_norm": 0.9098419249030089, "learning_rate": 1.764683543112116e-05, "loss": 0.1352, "step": 7023 }, { "epoch": 0.5468541676804048, "grad_norm": 0.9413900023369826, "learning_rate": 1.764187837152683e-05, "loss": 0.1151, "step": 7024 }, { "epoch": 0.5469320227726144, "grad_norm": 0.916658492703403, "learning_rate": 1.763692145883239e-05, "loss": 0.1079, "step": 7025 }, { "epoch": 0.5470098778648241, "grad_norm": 0.9207089428177186, "learning_rate": 1.763196469334663e-05, "loss": 0.1026, "step": 7026 }, { "epoch": 0.5470877329570337, "grad_norm": 0.906748848492229, "learning_rate": 1.7627008075378344e-05, "loss": 0.0921, "step": 7027 }, { "epoch": 0.5471655880492433, "grad_norm": 0.9431357055105213, "learning_rate": 1.76220516052363e-05, "loss": 0.086, "step": 7028 }, { "epoch": 0.547243443141453, "grad_norm": 0.9580689337709694, "learning_rate": 1.7617095283229265e-05, "loss": 0.122, "step": 7029 }, { "epoch": 0.5473212982336626, "grad_norm": 0.8340039578531234, "learning_rate": 1.7612139109665996e-05, "loss": 0.0905, "step": 7030 }, { "epoch": 0.5473991533258722, "grad_norm": 0.7840949749008274, "learning_rate": 1.760718308485523e-05, "loss": 0.0874, "step": 7031 }, { "epoch": 0.5474770084180819, "grad_norm": 0.9214559062694286, "learning_rate": 1.7602227209105712e-05, "loss": 0.0918, "step": 7032 }, { "epoch": 0.5475548635102915, "grad_norm": 0.9586619333065927, "learning_rate": 1.759727148272617e-05, "loss": 0.1109, "step": 7033 }, { "epoch": 0.5476327186025011, "grad_norm": 0.8021730639113792, "learning_rate": 1.7592315906025325e-05, "loss": 0.0716, "step": 7034 }, { "epoch": 0.5477105736947108, "grad_norm": 0.8312418644913091, "learning_rate": 1.7587360479311877e-05, "loss": 0.0994, "step": 7035 }, { "epoch": 0.5477884287869204, "grad_norm": 0.8679584796541718, "learning_rate": 1.7582405202894528e-05, "loss": 0.1035, "step": 7036 }, { "epoch": 0.54786628387913, "grad_norm": 0.8431291345442633, "learning_rate": 1.7577450077081976e-05, "loss": 0.1066, "step": 7037 }, { "epoch": 0.5479441389713396, "grad_norm": 0.8674605878549667, "learning_rate": 1.7572495102182895e-05, "loss": 0.1141, "step": 7038 }, { "epoch": 0.5480219940635492, "grad_norm": 0.8289280461832877, "learning_rate": 1.756754027850596e-05, "loss": 0.0948, "step": 7039 }, { "epoch": 0.5480998491557588, "grad_norm": 0.8467633579460817, "learning_rate": 1.7562585606359837e-05, "loss": 0.0788, "step": 7040 }, { "epoch": 0.5481777042479685, "grad_norm": 0.8610014267169739, "learning_rate": 1.755763108605316e-05, "loss": 0.1125, "step": 7041 }, { "epoch": 0.5482555593401781, "grad_norm": 0.8646874792920285, "learning_rate": 1.7552676717894595e-05, "loss": 0.0873, "step": 7042 }, { "epoch": 0.5483334144323877, "grad_norm": 0.8840986561966083, "learning_rate": 1.754772250219277e-05, "loss": 0.1037, "step": 7043 }, { "epoch": 0.5484112695245973, "grad_norm": 0.9901649990902227, "learning_rate": 1.7542768439256304e-05, "loss": 0.1141, "step": 7044 }, { "epoch": 0.548489124616807, "grad_norm": 0.8245569434898996, "learning_rate": 1.753781452939382e-05, "loss": 0.0806, "step": 7045 }, { "epoch": 0.5485669797090166, "grad_norm": 1.0053181517618168, "learning_rate": 1.75328607729139e-05, "loss": 0.1341, "step": 7046 }, { "epoch": 0.5486448348012262, "grad_norm": 0.859254837187079, "learning_rate": 1.7527907170125173e-05, "loss": 0.0932, "step": 7047 }, { "epoch": 0.5487226898934359, "grad_norm": 0.8095627578978907, "learning_rate": 1.7522953721336202e-05, "loss": 0.065, "step": 7048 }, { "epoch": 0.5488005449856455, "grad_norm": 0.8282651137172304, "learning_rate": 1.751800042685558e-05, "loss": 0.1038, "step": 7049 }, { "epoch": 0.548878400077855, "grad_norm": 0.8624486857497853, "learning_rate": 1.751304728699186e-05, "loss": 0.0939, "step": 7050 }, { "epoch": 0.548878400077855, "eval_loss": 0.012306218966841698, "eval_runtime": 162.6128, "eval_samples_per_second": 17.711, "eval_steps_per_second": 0.633, "step": 7050 }, { "epoch": 0.5489562551700647, "grad_norm": 0.8565070800254976, "learning_rate": 1.7508094302053602e-05, "loss": 0.087, "step": 7051 }, { "epoch": 0.5490341102622743, "grad_norm": 0.8256594322505145, "learning_rate": 1.750314147234935e-05, "loss": 0.0995, "step": 7052 }, { "epoch": 0.5491119653544839, "grad_norm": 0.8032876625991894, "learning_rate": 1.7498188798187656e-05, "loss": 0.0949, "step": 7053 }, { "epoch": 0.5491898204466936, "grad_norm": 0.933123606473754, "learning_rate": 1.7493236279877046e-05, "loss": 0.1094, "step": 7054 }, { "epoch": 0.5492676755389032, "grad_norm": 0.7763267852569387, "learning_rate": 1.7488283917726026e-05, "loss": 0.0885, "step": 7055 }, { "epoch": 0.5493455306311128, "grad_norm": 0.8398314072602998, "learning_rate": 1.7483331712043113e-05, "loss": 0.1069, "step": 7056 }, { "epoch": 0.5494233857233225, "grad_norm": 0.9317474566602136, "learning_rate": 1.7478379663136797e-05, "loss": 0.1269, "step": 7057 }, { "epoch": 0.5495012408155321, "grad_norm": 0.8420195815285489, "learning_rate": 1.7473427771315586e-05, "loss": 0.0973, "step": 7058 }, { "epoch": 0.5495790959077417, "grad_norm": 0.8698584927259652, "learning_rate": 1.7468476036887944e-05, "loss": 0.0986, "step": 7059 }, { "epoch": 0.5496569509999514, "grad_norm": 0.8713956428404052, "learning_rate": 1.746352446016235e-05, "loss": 0.0998, "step": 7060 }, { "epoch": 0.549734806092161, "grad_norm": 0.8532456988228059, "learning_rate": 1.7458573041447255e-05, "loss": 0.0922, "step": 7061 }, { "epoch": 0.5498126611843706, "grad_norm": 0.8160976415909901, "learning_rate": 1.745362178105111e-05, "loss": 0.0806, "step": 7062 }, { "epoch": 0.5498905162765803, "grad_norm": 0.8864102568059369, "learning_rate": 1.744867067928237e-05, "loss": 0.1146, "step": 7063 }, { "epoch": 0.5499683713687898, "grad_norm": 0.8884693264022413, "learning_rate": 1.744371973644945e-05, "loss": 0.1126, "step": 7064 }, { "epoch": 0.5500462264609994, "grad_norm": 0.8942105271634924, "learning_rate": 1.743876895286077e-05, "loss": 0.1158, "step": 7065 }, { "epoch": 0.5501240815532091, "grad_norm": 0.8607908848396189, "learning_rate": 1.743381832882475e-05, "loss": 0.0793, "step": 7066 }, { "epoch": 0.5502019366454187, "grad_norm": 0.8137379829811844, "learning_rate": 1.7428867864649777e-05, "loss": 0.082, "step": 7067 }, { "epoch": 0.5502797917376283, "grad_norm": 0.8433301317077514, "learning_rate": 1.7423917560644265e-05, "loss": 0.1089, "step": 7068 }, { "epoch": 0.550357646829838, "grad_norm": 0.7754853308023733, "learning_rate": 1.7418967417116577e-05, "loss": 0.0745, "step": 7069 }, { "epoch": 0.5504355019220476, "grad_norm": 0.9630786452966874, "learning_rate": 1.7414017434375083e-05, "loss": 0.114, "step": 7070 }, { "epoch": 0.5505133570142572, "grad_norm": 0.8888144837254338, "learning_rate": 1.740906761272815e-05, "loss": 0.0996, "step": 7071 }, { "epoch": 0.5505912121064669, "grad_norm": 0.8348115580075265, "learning_rate": 1.7404117952484122e-05, "loss": 0.0748, "step": 7072 }, { "epoch": 0.5506690671986765, "grad_norm": 1.0287512401557855, "learning_rate": 1.7399168453951343e-05, "loss": 0.1073, "step": 7073 }, { "epoch": 0.5507469222908861, "grad_norm": 0.9836272208253573, "learning_rate": 1.7394219117438152e-05, "loss": 0.1395, "step": 7074 }, { "epoch": 0.5508247773830958, "grad_norm": 0.9247568245867648, "learning_rate": 1.738926994325286e-05, "loss": 0.1213, "step": 7075 }, { "epoch": 0.5509026324753054, "grad_norm": 0.880494544409229, "learning_rate": 1.7384320931703774e-05, "loss": 0.0815, "step": 7076 }, { "epoch": 0.5509804875675149, "grad_norm": 0.8007185928610645, "learning_rate": 1.73793720830992e-05, "loss": 0.0794, "step": 7077 }, { "epoch": 0.5510583426597246, "grad_norm": 0.8247580707344366, "learning_rate": 1.737442339774743e-05, "loss": 0.0873, "step": 7078 }, { "epoch": 0.5511361977519342, "grad_norm": 0.8977769329230813, "learning_rate": 1.736947487595674e-05, "loss": 0.1232, "step": 7079 }, { "epoch": 0.5512140528441438, "grad_norm": 0.8791073125373249, "learning_rate": 1.7364526518035404e-05, "loss": 0.0882, "step": 7080 }, { "epoch": 0.5512919079363534, "grad_norm": 0.9252916701675564, "learning_rate": 1.7359578324291675e-05, "loss": 0.0944, "step": 7081 }, { "epoch": 0.5513697630285631, "grad_norm": 0.7967637688915871, "learning_rate": 1.73546302950338e-05, "loss": 0.067, "step": 7082 }, { "epoch": 0.5514476181207727, "grad_norm": 0.7598447373018179, "learning_rate": 1.734968243057003e-05, "loss": 0.0717, "step": 7083 }, { "epoch": 0.5515254732129823, "grad_norm": 0.8095328875414134, "learning_rate": 1.7344734731208584e-05, "loss": 0.0784, "step": 7084 }, { "epoch": 0.551603328305192, "grad_norm": 0.9956397511865126, "learning_rate": 1.733978719725769e-05, "loss": 0.1326, "step": 7085 }, { "epoch": 0.5516811833974016, "grad_norm": 0.8316053165457236, "learning_rate": 1.7334839829025544e-05, "loss": 0.0909, "step": 7086 }, { "epoch": 0.5517590384896112, "grad_norm": 0.7850015399155708, "learning_rate": 1.7329892626820342e-05, "loss": 0.087, "step": 7087 }, { "epoch": 0.5518368935818209, "grad_norm": 0.8767422502130588, "learning_rate": 1.732494559095029e-05, "loss": 0.1099, "step": 7088 }, { "epoch": 0.5519147486740305, "grad_norm": 0.8879396098177919, "learning_rate": 1.731999872172355e-05, "loss": 0.0974, "step": 7089 }, { "epoch": 0.55199260376624, "grad_norm": 0.8500205746806795, "learning_rate": 1.7315052019448295e-05, "loss": 0.1023, "step": 7090 }, { "epoch": 0.5520704588584497, "grad_norm": 0.9006665163509127, "learning_rate": 1.7310105484432677e-05, "loss": 0.1029, "step": 7091 }, { "epoch": 0.5521483139506593, "grad_norm": 0.857306275829497, "learning_rate": 1.7305159116984845e-05, "loss": 0.0935, "step": 7092 }, { "epoch": 0.5522261690428689, "grad_norm": 0.8318129849258402, "learning_rate": 1.7300212917412925e-05, "loss": 0.0992, "step": 7093 }, { "epoch": 0.5523040241350786, "grad_norm": 0.7597481219160661, "learning_rate": 1.729526688602506e-05, "loss": 0.0511, "step": 7094 }, { "epoch": 0.5523818792272882, "grad_norm": 0.8764449498978546, "learning_rate": 1.729032102312935e-05, "loss": 0.0836, "step": 7095 }, { "epoch": 0.5524597343194978, "grad_norm": 1.0077647362034887, "learning_rate": 1.7285375329033906e-05, "loss": 0.1434, "step": 7096 }, { "epoch": 0.5525375894117075, "grad_norm": 0.8139492988370175, "learning_rate": 1.728042980404682e-05, "loss": 0.083, "step": 7097 }, { "epoch": 0.5526154445039171, "grad_norm": 0.9036910997251495, "learning_rate": 1.7275484448476167e-05, "loss": 0.0975, "step": 7098 }, { "epoch": 0.5526932995961267, "grad_norm": 0.8654291775357104, "learning_rate": 1.7270539262630033e-05, "loss": 0.0853, "step": 7099 }, { "epoch": 0.5527711546883364, "grad_norm": 0.827034809114211, "learning_rate": 1.7265594246816473e-05, "loss": 0.0824, "step": 7100 }, { "epoch": 0.5527711546883364, "eval_loss": 0.012095167301595211, "eval_runtime": 162.4852, "eval_samples_per_second": 17.725, "eval_steps_per_second": 0.634, "step": 7100 }, { "epoch": 0.552849009780546, "grad_norm": 0.8357578047686611, "learning_rate": 1.726064940134354e-05, "loss": 0.1078, "step": 7101 }, { "epoch": 0.5529268648727556, "grad_norm": 0.8003488389220205, "learning_rate": 1.725570472651927e-05, "loss": 0.0924, "step": 7102 }, { "epoch": 0.5530047199649653, "grad_norm": 0.9073245482665059, "learning_rate": 1.7250760222651692e-05, "loss": 0.0852, "step": 7103 }, { "epoch": 0.5530825750571748, "grad_norm": 0.9184676824861305, "learning_rate": 1.7245815890048836e-05, "loss": 0.0875, "step": 7104 }, { "epoch": 0.5531604301493844, "grad_norm": 0.8992557561877796, "learning_rate": 1.7240871729018703e-05, "loss": 0.1019, "step": 7105 }, { "epoch": 0.5532382852415941, "grad_norm": 0.8746149927285214, "learning_rate": 1.7235927739869292e-05, "loss": 0.1113, "step": 7106 }, { "epoch": 0.5533161403338037, "grad_norm": 0.8918569113757837, "learning_rate": 1.72309839229086e-05, "loss": 0.097, "step": 7107 }, { "epoch": 0.5533939954260133, "grad_norm": 0.8863238135731921, "learning_rate": 1.7226040278444575e-05, "loss": 0.1063, "step": 7108 }, { "epoch": 0.553471850518223, "grad_norm": 0.873442840517129, "learning_rate": 1.722109680678521e-05, "loss": 0.1177, "step": 7109 }, { "epoch": 0.5535497056104326, "grad_norm": 0.8523100370344504, "learning_rate": 1.7216153508238455e-05, "loss": 0.097, "step": 7110 }, { "epoch": 0.5536275607026422, "grad_norm": 0.8256676395765524, "learning_rate": 1.7211210383112253e-05, "loss": 0.081, "step": 7111 }, { "epoch": 0.5537054157948519, "grad_norm": 0.856268158852439, "learning_rate": 1.7206267431714534e-05, "loss": 0.0772, "step": 7112 }, { "epoch": 0.5537832708870615, "grad_norm": 0.8746574450556156, "learning_rate": 1.7201324654353213e-05, "loss": 0.0911, "step": 7113 }, { "epoch": 0.5538611259792711, "grad_norm": 0.8982117862850187, "learning_rate": 1.7196382051336222e-05, "loss": 0.1042, "step": 7114 }, { "epoch": 0.5539389810714807, "grad_norm": 0.7931657248051449, "learning_rate": 1.7191439622971448e-05, "loss": 0.0967, "step": 7115 }, { "epoch": 0.5540168361636904, "grad_norm": 0.8160662270531959, "learning_rate": 1.7186497369566784e-05, "loss": 0.0817, "step": 7116 }, { "epoch": 0.5540946912558999, "grad_norm": 0.7871477574120815, "learning_rate": 1.7181555291430114e-05, "loss": 0.0771, "step": 7117 }, { "epoch": 0.5541725463481095, "grad_norm": 0.7515295387733689, "learning_rate": 1.7176613388869285e-05, "loss": 0.0841, "step": 7118 }, { "epoch": 0.5542504014403192, "grad_norm": 0.8626059870161726, "learning_rate": 1.7171671662192185e-05, "loss": 0.0674, "step": 7119 }, { "epoch": 0.5543282565325288, "grad_norm": 0.7644737759135892, "learning_rate": 1.7166730111706646e-05, "loss": 0.0785, "step": 7120 }, { "epoch": 0.5544061116247384, "grad_norm": 0.9318895531388401, "learning_rate": 1.7161788737720506e-05, "loss": 0.1223, "step": 7121 }, { "epoch": 0.5544839667169481, "grad_norm": 0.9306602941984264, "learning_rate": 1.715684754054158e-05, "loss": 0.0944, "step": 7122 }, { "epoch": 0.5545618218091577, "grad_norm": 0.80227027462095, "learning_rate": 1.715190652047768e-05, "loss": 0.1008, "step": 7123 }, { "epoch": 0.5546396769013673, "grad_norm": 0.9313556408830809, "learning_rate": 1.714696567783663e-05, "loss": 0.1096, "step": 7124 }, { "epoch": 0.554717531993577, "grad_norm": 0.8739940372724201, "learning_rate": 1.714202501292621e-05, "loss": 0.0958, "step": 7125 }, { "epoch": 0.5547953870857866, "grad_norm": 0.8840580970557966, "learning_rate": 1.713708452605419e-05, "loss": 0.0824, "step": 7126 }, { "epoch": 0.5548732421779962, "grad_norm": 0.9598773982752129, "learning_rate": 1.7132144217528353e-05, "loss": 0.1064, "step": 7127 }, { "epoch": 0.5549510972702059, "grad_norm": 0.8658273151446724, "learning_rate": 1.712720408765645e-05, "loss": 0.0949, "step": 7128 }, { "epoch": 0.5550289523624155, "grad_norm": 0.7435414034596565, "learning_rate": 1.7122264136746217e-05, "loss": 0.0538, "step": 7129 }, { "epoch": 0.555106807454625, "grad_norm": 0.8496578659451745, "learning_rate": 1.7117324365105416e-05, "loss": 0.0701, "step": 7130 }, { "epoch": 0.5551846625468347, "grad_norm": 1.0229970760380234, "learning_rate": 1.711238477304175e-05, "loss": 0.1258, "step": 7131 }, { "epoch": 0.5552625176390443, "grad_norm": 0.7174037384177164, "learning_rate": 1.7107445360862945e-05, "loss": 0.062, "step": 7132 }, { "epoch": 0.5553403727312539, "grad_norm": 0.8426708628381671, "learning_rate": 1.71025061288767e-05, "loss": 0.0913, "step": 7133 }, { "epoch": 0.5554182278234636, "grad_norm": 0.8326198783162093, "learning_rate": 1.7097567077390693e-05, "loss": 0.086, "step": 7134 }, { "epoch": 0.5554960829156732, "grad_norm": 0.8407762527311077, "learning_rate": 1.709262820671262e-05, "loss": 0.1025, "step": 7135 }, { "epoch": 0.5555739380078828, "grad_norm": 0.9297471760718913, "learning_rate": 1.7087689517150144e-05, "loss": 0.1033, "step": 7136 }, { "epoch": 0.5556517931000925, "grad_norm": 0.833720530784638, "learning_rate": 1.7082751009010924e-05, "loss": 0.1127, "step": 7137 }, { "epoch": 0.5557296481923021, "grad_norm": 0.8640752483610142, "learning_rate": 1.7077812682602602e-05, "loss": 0.1042, "step": 7138 }, { "epoch": 0.5558075032845117, "grad_norm": 0.8438319213351032, "learning_rate": 1.707287453823281e-05, "loss": 0.0871, "step": 7139 }, { "epoch": 0.5558853583767214, "grad_norm": 0.8656446324014978, "learning_rate": 1.7067936576209183e-05, "loss": 0.0884, "step": 7140 }, { "epoch": 0.555963213468931, "grad_norm": 0.8929015203907651, "learning_rate": 1.7062998796839323e-05, "loss": 0.089, "step": 7141 }, { "epoch": 0.5560410685611406, "grad_norm": 0.900667192025211, "learning_rate": 1.7058061200430833e-05, "loss": 0.1029, "step": 7142 }, { "epoch": 0.5561189236533503, "grad_norm": 0.8385751714378975, "learning_rate": 1.70531237872913e-05, "loss": 0.0854, "step": 7143 }, { "epoch": 0.5561967787455598, "grad_norm": 0.7351224265190495, "learning_rate": 1.7048186557728304e-05, "loss": 0.0719, "step": 7144 }, { "epoch": 0.5562746338377694, "grad_norm": 0.8373934632609612, "learning_rate": 1.704324951204941e-05, "loss": 0.0927, "step": 7145 }, { "epoch": 0.5563524889299791, "grad_norm": 0.8518507958717091, "learning_rate": 1.7038312650562177e-05, "loss": 0.0984, "step": 7146 }, { "epoch": 0.5564303440221887, "grad_norm": 0.7562763384572614, "learning_rate": 1.7033375973574144e-05, "loss": 0.091, "step": 7147 }, { "epoch": 0.5565081991143983, "grad_norm": 0.7499710587181179, "learning_rate": 1.7028439481392844e-05, "loss": 0.08, "step": 7148 }, { "epoch": 0.556586054206608, "grad_norm": 0.8542676313047828, "learning_rate": 1.702350317432579e-05, "loss": 0.1359, "step": 7149 }, { "epoch": 0.5566639092988176, "grad_norm": 0.8211855033001242, "learning_rate": 1.7018567052680502e-05, "loss": 0.0777, "step": 7150 }, { "epoch": 0.5566639092988176, "eval_loss": 0.011882110498845577, "eval_runtime": 162.5838, "eval_samples_per_second": 17.714, "eval_steps_per_second": 0.634, "step": 7150 }, { "epoch": 0.5567417643910272, "grad_norm": 0.8571465559483435, "learning_rate": 1.701363111676447e-05, "loss": 0.0935, "step": 7151 }, { "epoch": 0.5568196194832368, "grad_norm": 0.821665094018911, "learning_rate": 1.700869536688519e-05, "loss": 0.076, "step": 7152 }, { "epoch": 0.5568974745754465, "grad_norm": 0.7717663212998284, "learning_rate": 1.7003759803350122e-05, "loss": 0.0846, "step": 7153 }, { "epoch": 0.5569753296676561, "grad_norm": 0.8371824064792368, "learning_rate": 1.6998824426466733e-05, "loss": 0.0913, "step": 7154 }, { "epoch": 0.5570531847598656, "grad_norm": 0.814048846955777, "learning_rate": 1.6993889236542482e-05, "loss": 0.1058, "step": 7155 }, { "epoch": 0.5571310398520753, "grad_norm": 0.7929739243068147, "learning_rate": 1.6988954233884798e-05, "loss": 0.0817, "step": 7156 }, { "epoch": 0.5572088949442849, "grad_norm": 0.9370873962802324, "learning_rate": 1.6984019418801113e-05, "loss": 0.1079, "step": 7157 }, { "epoch": 0.5572867500364945, "grad_norm": 0.7955262448942524, "learning_rate": 1.697908479159884e-05, "loss": 0.0808, "step": 7158 }, { "epoch": 0.5573646051287042, "grad_norm": 0.8795205980801641, "learning_rate": 1.6974150352585383e-05, "loss": 0.0867, "step": 7159 }, { "epoch": 0.5574424602209138, "grad_norm": 0.848892390016008, "learning_rate": 1.696921610206814e-05, "loss": 0.0937, "step": 7160 }, { "epoch": 0.5575203153131234, "grad_norm": 0.7658332482944301, "learning_rate": 1.6964282040354486e-05, "loss": 0.0878, "step": 7161 }, { "epoch": 0.5575981704053331, "grad_norm": 0.7706485376835855, "learning_rate": 1.6959348167751795e-05, "loss": 0.0847, "step": 7162 }, { "epoch": 0.5576760254975427, "grad_norm": 0.9347050618091357, "learning_rate": 1.695441448456742e-05, "loss": 0.1107, "step": 7163 }, { "epoch": 0.5577538805897523, "grad_norm": 0.7811380135667316, "learning_rate": 1.69494809911087e-05, "loss": 0.0796, "step": 7164 }, { "epoch": 0.557831735681962, "grad_norm": 0.8582171943506935, "learning_rate": 1.6944547687682986e-05, "loss": 0.1101, "step": 7165 }, { "epoch": 0.5579095907741716, "grad_norm": 0.8055028239427804, "learning_rate": 1.693961457459758e-05, "loss": 0.0733, "step": 7166 }, { "epoch": 0.5579874458663812, "grad_norm": 0.8586206260820178, "learning_rate": 1.6934681652159808e-05, "loss": 0.0836, "step": 7167 }, { "epoch": 0.5580653009585909, "grad_norm": 0.7574538941636256, "learning_rate": 1.692974892067696e-05, "loss": 0.0804, "step": 7168 }, { "epoch": 0.5581431560508004, "grad_norm": 0.7061380245755378, "learning_rate": 1.6924816380456325e-05, "loss": 0.0703, "step": 7169 }, { "epoch": 0.55822101114301, "grad_norm": 0.7798652006243237, "learning_rate": 1.6919884031805166e-05, "loss": 0.0789, "step": 7170 }, { "epoch": 0.5582988662352197, "grad_norm": 0.8963891332716343, "learning_rate": 1.6914951875030756e-05, "loss": 0.105, "step": 7171 }, { "epoch": 0.5583767213274293, "grad_norm": 0.818365764238917, "learning_rate": 1.691001991044035e-05, "loss": 0.1067, "step": 7172 }, { "epoch": 0.5584545764196389, "grad_norm": 0.7785846053772135, "learning_rate": 1.6905088138341174e-05, "loss": 0.0777, "step": 7173 }, { "epoch": 0.5585324315118486, "grad_norm": 0.8971964720117614, "learning_rate": 1.6900156559040465e-05, "loss": 0.0752, "step": 7174 }, { "epoch": 0.5586102866040582, "grad_norm": 0.8815431616119482, "learning_rate": 1.6895225172845423e-05, "loss": 0.1092, "step": 7175 }, { "epoch": 0.5586881416962678, "grad_norm": 0.8610363260388004, "learning_rate": 1.6890293980063264e-05, "loss": 0.1197, "step": 7176 }, { "epoch": 0.5587659967884775, "grad_norm": 0.8775564380516777, "learning_rate": 1.688536298100117e-05, "loss": 0.1033, "step": 7177 }, { "epoch": 0.5588438518806871, "grad_norm": 0.8292183036929921, "learning_rate": 1.6880432175966334e-05, "loss": 0.0943, "step": 7178 }, { "epoch": 0.5589217069728967, "grad_norm": 0.8616964238739697, "learning_rate": 1.6875501565265903e-05, "loss": 0.1202, "step": 7179 }, { "epoch": 0.5589995620651064, "grad_norm": 0.8605419111917607, "learning_rate": 1.6870571149207027e-05, "loss": 0.103, "step": 7180 }, { "epoch": 0.559077417157316, "grad_norm": 0.8172355548099748, "learning_rate": 1.686564092809687e-05, "loss": 0.0697, "step": 7181 }, { "epoch": 0.5591552722495255, "grad_norm": 0.9009346605650407, "learning_rate": 1.6860710902242548e-05, "loss": 0.1211, "step": 7182 }, { "epoch": 0.5592331273417352, "grad_norm": 0.8735088506447233, "learning_rate": 1.6855781071951187e-05, "loss": 0.1054, "step": 7183 }, { "epoch": 0.5593109824339448, "grad_norm": 0.8109433781909003, "learning_rate": 1.685085143752988e-05, "loss": 0.0897, "step": 7184 }, { "epoch": 0.5593888375261544, "grad_norm": 0.7575500234334163, "learning_rate": 1.684592199928572e-05, "loss": 0.087, "step": 7185 }, { "epoch": 0.5594666926183641, "grad_norm": 0.7354260051196718, "learning_rate": 1.6840992757525802e-05, "loss": 0.0662, "step": 7186 }, { "epoch": 0.5595445477105737, "grad_norm": 0.8077040065523875, "learning_rate": 1.683606371255719e-05, "loss": 0.082, "step": 7187 }, { "epoch": 0.5596224028027833, "grad_norm": 0.7707050707916265, "learning_rate": 1.6831134864686932e-05, "loss": 0.0904, "step": 7188 }, { "epoch": 0.5597002578949929, "grad_norm": 0.8201052516434215, "learning_rate": 1.6826206214222077e-05, "loss": 0.0869, "step": 7189 }, { "epoch": 0.5597781129872026, "grad_norm": 0.7728425913914939, "learning_rate": 1.682127776146965e-05, "loss": 0.0642, "step": 7190 }, { "epoch": 0.5598559680794122, "grad_norm": 0.7907881054945394, "learning_rate": 1.6816349506736682e-05, "loss": 0.1019, "step": 7191 }, { "epoch": 0.5599338231716218, "grad_norm": 0.8004046128333991, "learning_rate": 1.6811421450330183e-05, "loss": 0.0917, "step": 7192 }, { "epoch": 0.5600116782638315, "grad_norm": 0.8509218233557033, "learning_rate": 1.6806493592557135e-05, "loss": 0.0782, "step": 7193 }, { "epoch": 0.5600895333560411, "grad_norm": 0.8276981041882865, "learning_rate": 1.6801565933724526e-05, "loss": 0.0877, "step": 7194 }, { "epoch": 0.5601673884482506, "grad_norm": 0.8079193932119718, "learning_rate": 1.6796638474139315e-05, "loss": 0.0786, "step": 7195 }, { "epoch": 0.5602452435404603, "grad_norm": 0.786460554733452, "learning_rate": 1.679171121410848e-05, "loss": 0.0868, "step": 7196 }, { "epoch": 0.5603230986326699, "grad_norm": 0.9674557489712493, "learning_rate": 1.678678415393896e-05, "loss": 0.1153, "step": 7197 }, { "epoch": 0.5604009537248795, "grad_norm": 0.7165837213014001, "learning_rate": 1.678185729393768e-05, "loss": 0.0799, "step": 7198 }, { "epoch": 0.5604788088170892, "grad_norm": 0.8588663100773729, "learning_rate": 1.6776930634411564e-05, "loss": 0.1188, "step": 7199 }, { "epoch": 0.5605566639092988, "grad_norm": 0.7689969033563732, "learning_rate": 1.677200417566751e-05, "loss": 0.0818, "step": 7200 }, { "epoch": 0.5605566639092988, "eval_loss": 0.011579836718738079, "eval_runtime": 162.7906, "eval_samples_per_second": 17.691, "eval_steps_per_second": 0.633, "step": 7200 }, { "epoch": 0.5606345190015084, "grad_norm": 0.813847592856234, "learning_rate": 1.6767077918012436e-05, "loss": 0.1002, "step": 7201 }, { "epoch": 0.5607123740937181, "grad_norm": 0.7412610574749596, "learning_rate": 1.6762151861753207e-05, "loss": 0.0844, "step": 7202 }, { "epoch": 0.5607902291859277, "grad_norm": 0.8819694646090411, "learning_rate": 1.6757226007196697e-05, "loss": 0.1183, "step": 7203 }, { "epoch": 0.5608680842781373, "grad_norm": 0.7861894443469948, "learning_rate": 1.6752300354649767e-05, "loss": 0.0672, "step": 7204 }, { "epoch": 0.560945939370347, "grad_norm": 0.8403522752826758, "learning_rate": 1.6747374904419257e-05, "loss": 0.1136, "step": 7205 }, { "epoch": 0.5610237944625566, "grad_norm": 0.8098014311183179, "learning_rate": 1.6742449656812e-05, "loss": 0.0812, "step": 7206 }, { "epoch": 0.5611016495547662, "grad_norm": 0.8671820859560861, "learning_rate": 1.673752461213482e-05, "loss": 0.0915, "step": 7207 }, { "epoch": 0.5611795046469759, "grad_norm": 0.8285214470615716, "learning_rate": 1.6732599770694523e-05, "loss": 0.0798, "step": 7208 }, { "epoch": 0.5612573597391854, "grad_norm": 0.7733989549396908, "learning_rate": 1.67276751327979e-05, "loss": 0.103, "step": 7209 }, { "epoch": 0.561335214831395, "grad_norm": 0.8946556664264216, "learning_rate": 1.6722750698751737e-05, "loss": 0.1223, "step": 7210 }, { "epoch": 0.5614130699236047, "grad_norm": 0.7558243778337214, "learning_rate": 1.6717826468862798e-05, "loss": 0.0813, "step": 7211 }, { "epoch": 0.5614909250158143, "grad_norm": 0.7665323516341576, "learning_rate": 1.671290244343784e-05, "loss": 0.1016, "step": 7212 }, { "epoch": 0.5615687801080239, "grad_norm": 0.8278395601958815, "learning_rate": 1.6707978622783617e-05, "loss": 0.1111, "step": 7213 }, { "epoch": 0.5616466352002336, "grad_norm": 0.7716804750211342, "learning_rate": 1.6703055007206847e-05, "loss": 0.0884, "step": 7214 }, { "epoch": 0.5617244902924432, "grad_norm": 0.8735343343907688, "learning_rate": 1.6698131597014256e-05, "loss": 0.0746, "step": 7215 }, { "epoch": 0.5618023453846528, "grad_norm": 0.8671203155677094, "learning_rate": 1.669320839251254e-05, "loss": 0.1153, "step": 7216 }, { "epoch": 0.5618802004768625, "grad_norm": 0.866830206675363, "learning_rate": 1.6688285394008404e-05, "loss": 0.0979, "step": 7217 }, { "epoch": 0.5619580555690721, "grad_norm": 0.8209081003570916, "learning_rate": 1.668336260180852e-05, "loss": 0.093, "step": 7218 }, { "epoch": 0.5620359106612817, "grad_norm": 0.8200034590890919, "learning_rate": 1.6678440016219558e-05, "loss": 0.0625, "step": 7219 }, { "epoch": 0.5621137657534914, "grad_norm": 0.7687635972765914, "learning_rate": 1.6673517637548166e-05, "loss": 0.0817, "step": 7220 }, { "epoch": 0.562191620845701, "grad_norm": 0.855034612431575, "learning_rate": 1.666859546610099e-05, "loss": 0.1124, "step": 7221 }, { "epoch": 0.5622694759379105, "grad_norm": 0.8340834889307344, "learning_rate": 1.666367350218466e-05, "loss": 0.0907, "step": 7222 }, { "epoch": 0.5623473310301201, "grad_norm": 0.8589713412246692, "learning_rate": 1.6658751746105787e-05, "loss": 0.1041, "step": 7223 }, { "epoch": 0.5624251861223298, "grad_norm": 0.8259364220314912, "learning_rate": 1.6653830198170976e-05, "loss": 0.0888, "step": 7224 }, { "epoch": 0.5625030412145394, "grad_norm": 0.8329687761046797, "learning_rate": 1.6648908858686817e-05, "loss": 0.0717, "step": 7225 }, { "epoch": 0.562580896306749, "grad_norm": 0.8167755045462272, "learning_rate": 1.6643987727959877e-05, "loss": 0.0937, "step": 7226 }, { "epoch": 0.5626587513989587, "grad_norm": 0.7724739131832058, "learning_rate": 1.6639066806296736e-05, "loss": 0.0943, "step": 7227 }, { "epoch": 0.5627366064911683, "grad_norm": 0.9314443852369406, "learning_rate": 1.663414609400393e-05, "loss": 0.1125, "step": 7228 }, { "epoch": 0.5628144615833779, "grad_norm": 0.8725016080661862, "learning_rate": 1.6629225591388004e-05, "loss": 0.09, "step": 7229 }, { "epoch": 0.5628923166755876, "grad_norm": 0.782207786165553, "learning_rate": 1.662430529875548e-05, "loss": 0.1261, "step": 7230 }, { "epoch": 0.5629701717677972, "grad_norm": 0.7506226657228804, "learning_rate": 1.6619385216412864e-05, "loss": 0.0736, "step": 7231 }, { "epoch": 0.5630480268600068, "grad_norm": 0.9079237797556443, "learning_rate": 1.6614465344666664e-05, "loss": 0.0991, "step": 7232 }, { "epoch": 0.5631258819522165, "grad_norm": 0.8506863098041711, "learning_rate": 1.660954568382336e-05, "loss": 0.0991, "step": 7233 }, { "epoch": 0.5632037370444261, "grad_norm": 0.8794065163560102, "learning_rate": 1.660462623418942e-05, "loss": 0.0876, "step": 7234 }, { "epoch": 0.5632815921366356, "grad_norm": 0.7583173459420028, "learning_rate": 1.6599706996071308e-05, "loss": 0.0683, "step": 7235 }, { "epoch": 0.5633594472288453, "grad_norm": 0.844372193060926, "learning_rate": 1.6594787969775462e-05, "loss": 0.0869, "step": 7236 }, { "epoch": 0.5634373023210549, "grad_norm": 0.72478068820934, "learning_rate": 1.6589869155608325e-05, "loss": 0.0855, "step": 7237 }, { "epoch": 0.5635151574132645, "grad_norm": 0.8082765394450545, "learning_rate": 1.658495055387631e-05, "loss": 0.0958, "step": 7238 }, { "epoch": 0.5635930125054742, "grad_norm": 0.7890187548040654, "learning_rate": 1.6580032164885824e-05, "loss": 0.0939, "step": 7239 }, { "epoch": 0.5636708675976838, "grad_norm": 0.7705434376316723, "learning_rate": 1.6575113988943257e-05, "loss": 0.0921, "step": 7240 }, { "epoch": 0.5637487226898934, "grad_norm": 0.7412235152445864, "learning_rate": 1.6570196026354988e-05, "loss": 0.0826, "step": 7241 }, { "epoch": 0.5638265777821031, "grad_norm": 0.8221787257117805, "learning_rate": 1.656527827742739e-05, "loss": 0.1027, "step": 7242 }, { "epoch": 0.5639044328743127, "grad_norm": 0.8409586194072497, "learning_rate": 1.6560360742466807e-05, "loss": 0.0802, "step": 7243 }, { "epoch": 0.5639822879665223, "grad_norm": 0.7912963248536958, "learning_rate": 1.6555443421779583e-05, "loss": 0.081, "step": 7244 }, { "epoch": 0.564060143058732, "grad_norm": 0.76226213150656, "learning_rate": 1.655052631567205e-05, "loss": 0.0888, "step": 7245 }, { "epoch": 0.5641379981509416, "grad_norm": 0.8007965970863821, "learning_rate": 1.654560942445051e-05, "loss": 0.0877, "step": 7246 }, { "epoch": 0.5642158532431512, "grad_norm": 0.7543762278730634, "learning_rate": 1.6540692748421253e-05, "loss": 0.0784, "step": 7247 }, { "epoch": 0.5642937083353609, "grad_norm": 0.7136834705939058, "learning_rate": 1.653577628789059e-05, "loss": 0.0606, "step": 7248 }, { "epoch": 0.5643715634275704, "grad_norm": 0.793194521442243, "learning_rate": 1.653086004316478e-05, "loss": 0.0733, "step": 7249 }, { "epoch": 0.56444941851978, "grad_norm": 0.8616305127199684, "learning_rate": 1.6525944014550085e-05, "loss": 0.0882, "step": 7250 }, { "epoch": 0.56444941851978, "eval_loss": 0.011412674561142921, "eval_runtime": 162.5026, "eval_samples_per_second": 17.723, "eval_steps_per_second": 0.634, "step": 7250 }, { "epoch": 0.5645272736119897, "grad_norm": 0.765866363535925, "learning_rate": 1.6521028202352746e-05, "loss": 0.0806, "step": 7251 }, { "epoch": 0.5646051287041993, "grad_norm": 0.8493525934005827, "learning_rate": 1.651611260687899e-05, "loss": 0.0886, "step": 7252 }, { "epoch": 0.5646829837964089, "grad_norm": 0.889623946549855, "learning_rate": 1.651119722843505e-05, "loss": 0.0871, "step": 7253 }, { "epoch": 0.5647608388886186, "grad_norm": 0.7970492739574851, "learning_rate": 1.6506282067327128e-05, "loss": 0.0834, "step": 7254 }, { "epoch": 0.5648386939808282, "grad_norm": 0.7876602342098944, "learning_rate": 1.6501367123861405e-05, "loss": 0.0718, "step": 7255 }, { "epoch": 0.5649165490730378, "grad_norm": 0.8127444753239115, "learning_rate": 1.6496452398344066e-05, "loss": 0.0727, "step": 7256 }, { "epoch": 0.5649944041652475, "grad_norm": 0.841679707457626, "learning_rate": 1.6491537891081267e-05, "loss": 0.0999, "step": 7257 }, { "epoch": 0.5650722592574571, "grad_norm": 0.7956566440354591, "learning_rate": 1.6486623602379178e-05, "loss": 0.1045, "step": 7258 }, { "epoch": 0.5651501143496667, "grad_norm": 0.893480739328161, "learning_rate": 1.6481709532543925e-05, "loss": 0.1521, "step": 7259 }, { "epoch": 0.5652279694418763, "grad_norm": 0.7949308989564358, "learning_rate": 1.6476795681881625e-05, "loss": 0.1076, "step": 7260 }, { "epoch": 0.565305824534086, "grad_norm": 0.7656274788231117, "learning_rate": 1.6471882050698396e-05, "loss": 0.0641, "step": 7261 }, { "epoch": 0.5653836796262955, "grad_norm": 0.7716623045968953, "learning_rate": 1.6466968639300322e-05, "loss": 0.0776, "step": 7262 }, { "epoch": 0.5654615347185051, "grad_norm": 0.8229167579562925, "learning_rate": 1.6462055447993506e-05, "loss": 0.0826, "step": 7263 }, { "epoch": 0.5655393898107148, "grad_norm": 0.8166793357027229, "learning_rate": 1.6457142477084006e-05, "loss": 0.0872, "step": 7264 }, { "epoch": 0.5656172449029244, "grad_norm": 0.8057669321772579, "learning_rate": 1.6452229726877872e-05, "loss": 0.0874, "step": 7265 }, { "epoch": 0.565695099995134, "grad_norm": 0.8374644641032121, "learning_rate": 1.6447317197681153e-05, "loss": 0.1128, "step": 7266 }, { "epoch": 0.5657729550873437, "grad_norm": 0.7771151434413497, "learning_rate": 1.6442404889799866e-05, "loss": 0.0763, "step": 7267 }, { "epoch": 0.5658508101795533, "grad_norm": 0.7512911656776774, "learning_rate": 1.6437492803540043e-05, "loss": 0.0796, "step": 7268 }, { "epoch": 0.5659286652717629, "grad_norm": 0.855031187852307, "learning_rate": 1.643258093920767e-05, "loss": 0.087, "step": 7269 }, { "epoch": 0.5660065203639726, "grad_norm": 0.7897121116659838, "learning_rate": 1.6427669297108736e-05, "loss": 0.098, "step": 7270 }, { "epoch": 0.5660843754561822, "grad_norm": 0.7493957116448738, "learning_rate": 1.642275787754921e-05, "loss": 0.0579, "step": 7271 }, { "epoch": 0.5661622305483918, "grad_norm": 0.7389040985362773, "learning_rate": 1.6417846680835054e-05, "loss": 0.0745, "step": 7272 }, { "epoch": 0.5662400856406015, "grad_norm": 0.8409015277300427, "learning_rate": 1.6412935707272212e-05, "loss": 0.0813, "step": 7273 }, { "epoch": 0.566317940732811, "grad_norm": 0.8333502239085506, "learning_rate": 1.640802495716662e-05, "loss": 0.1057, "step": 7274 }, { "epoch": 0.5663957958250206, "grad_norm": 0.8436532895917567, "learning_rate": 1.6403114430824187e-05, "loss": 0.0908, "step": 7275 }, { "epoch": 0.5664736509172303, "grad_norm": 0.812420118940963, "learning_rate": 1.6398204128550815e-05, "loss": 0.1011, "step": 7276 }, { "epoch": 0.5665515060094399, "grad_norm": 0.890546067857565, "learning_rate": 1.6393294050652395e-05, "loss": 0.0948, "step": 7277 }, { "epoch": 0.5666293611016495, "grad_norm": 0.6900088323852934, "learning_rate": 1.6388384197434808e-05, "loss": 0.0584, "step": 7278 }, { "epoch": 0.5667072161938592, "grad_norm": 0.8485003478777814, "learning_rate": 1.6383474569203907e-05, "loss": 0.0926, "step": 7279 }, { "epoch": 0.5667850712860688, "grad_norm": 0.7499491844714317, "learning_rate": 1.637856516626554e-05, "loss": 0.0807, "step": 7280 }, { "epoch": 0.5668629263782784, "grad_norm": 0.810592530000993, "learning_rate": 1.6373655988925544e-05, "loss": 0.1165, "step": 7281 }, { "epoch": 0.5669407814704881, "grad_norm": 0.7366258008789778, "learning_rate": 1.6368747037489732e-05, "loss": 0.0786, "step": 7282 }, { "epoch": 0.5670186365626977, "grad_norm": 0.815926216487345, "learning_rate": 1.636383831226391e-05, "loss": 0.0872, "step": 7283 }, { "epoch": 0.5670964916549073, "grad_norm": 0.7993152383140361, "learning_rate": 1.635892981355387e-05, "loss": 0.0739, "step": 7284 }, { "epoch": 0.567174346747117, "grad_norm": 0.7803252096197029, "learning_rate": 1.6354021541665393e-05, "loss": 0.0882, "step": 7285 }, { "epoch": 0.5672522018393266, "grad_norm": 0.7413219578277467, "learning_rate": 1.6349113496904233e-05, "loss": 0.068, "step": 7286 }, { "epoch": 0.5673300569315362, "grad_norm": 0.7686168311036206, "learning_rate": 1.6344205679576142e-05, "loss": 0.0983, "step": 7287 }, { "epoch": 0.5674079120237459, "grad_norm": 0.7561305659276797, "learning_rate": 1.6339298089986848e-05, "loss": 0.0911, "step": 7288 }, { "epoch": 0.5674857671159554, "grad_norm": 0.8362469329640264, "learning_rate": 1.6334390728442084e-05, "loss": 0.0857, "step": 7289 }, { "epoch": 0.567563622208165, "grad_norm": 0.783540867158515, "learning_rate": 1.6329483595247545e-05, "loss": 0.0928, "step": 7290 }, { "epoch": 0.5676414773003747, "grad_norm": 0.8529420672331323, "learning_rate": 1.6324576690708925e-05, "loss": 0.0829, "step": 7291 }, { "epoch": 0.5677193323925843, "grad_norm": 0.7837509850847953, "learning_rate": 1.63196700151319e-05, "loss": 0.0984, "step": 7292 }, { "epoch": 0.5677971874847939, "grad_norm": 0.7925983703630732, "learning_rate": 1.6314763568822134e-05, "loss": 0.094, "step": 7293 }, { "epoch": 0.5678750425770035, "grad_norm": 0.8318970224567402, "learning_rate": 1.6309857352085274e-05, "loss": 0.0743, "step": 7294 }, { "epoch": 0.5679528976692132, "grad_norm": 0.7105070575860034, "learning_rate": 1.630495136522696e-05, "loss": 0.069, "step": 7295 }, { "epoch": 0.5680307527614228, "grad_norm": 0.7077718033988774, "learning_rate": 1.6300045608552806e-05, "loss": 0.0634, "step": 7296 }, { "epoch": 0.5681086078536324, "grad_norm": 0.8092253005187835, "learning_rate": 1.629514008236842e-05, "loss": 0.106, "step": 7297 }, { "epoch": 0.5681864629458421, "grad_norm": 0.7887858674796305, "learning_rate": 1.6290234786979387e-05, "loss": 0.1094, "step": 7298 }, { "epoch": 0.5682643180380517, "grad_norm": 0.8913486143651118, "learning_rate": 1.6285329722691296e-05, "loss": 0.1202, "step": 7299 }, { "epoch": 0.5683421731302613, "grad_norm": 0.831805185576197, "learning_rate": 1.6280424889809703e-05, "loss": 0.1178, "step": 7300 }, { "epoch": 0.5683421731302613, "eval_loss": 0.011141207069158554, "eval_runtime": 162.5352, "eval_samples_per_second": 17.719, "eval_steps_per_second": 0.634, "step": 7300 }, { "epoch": 0.568420028222471, "grad_norm": 0.6678672644242615, "learning_rate": 1.6275520288640153e-05, "loss": 0.0687, "step": 7301 }, { "epoch": 0.5684978833146805, "grad_norm": 0.7742667413994555, "learning_rate": 1.6270615919488184e-05, "loss": 0.0738, "step": 7302 }, { "epoch": 0.5685757384068901, "grad_norm": 0.8888173151266485, "learning_rate": 1.6265711782659306e-05, "loss": 0.1114, "step": 7303 }, { "epoch": 0.5686535934990998, "grad_norm": 0.7730552509145107, "learning_rate": 1.6260807878459043e-05, "loss": 0.0859, "step": 7304 }, { "epoch": 0.5687314485913094, "grad_norm": 0.8103306835468198, "learning_rate": 1.6255904207192867e-05, "loss": 0.0869, "step": 7305 }, { "epoch": 0.568809303683519, "grad_norm": 0.8173805261024163, "learning_rate": 1.6251000769166263e-05, "loss": 0.0936, "step": 7306 }, { "epoch": 0.5688871587757287, "grad_norm": 0.7416627945196129, "learning_rate": 1.624609756468469e-05, "loss": 0.0727, "step": 7307 }, { "epoch": 0.5689650138679383, "grad_norm": 0.7599917973434863, "learning_rate": 1.6241194594053584e-05, "loss": 0.0842, "step": 7308 }, { "epoch": 0.5690428689601479, "grad_norm": 0.732918718901647, "learning_rate": 1.6236291857578397e-05, "loss": 0.0732, "step": 7309 }, { "epoch": 0.5691207240523576, "grad_norm": 0.7571368128362466, "learning_rate": 1.6231389355564535e-05, "loss": 0.0814, "step": 7310 }, { "epoch": 0.5691985791445672, "grad_norm": 0.7779037111730399, "learning_rate": 1.62264870883174e-05, "loss": 0.0834, "step": 7311 }, { "epoch": 0.5692764342367768, "grad_norm": 0.7407186890926658, "learning_rate": 1.6221585056142387e-05, "loss": 0.068, "step": 7312 }, { "epoch": 0.5693542893289865, "grad_norm": 0.795407098142597, "learning_rate": 1.6216683259344853e-05, "loss": 0.0815, "step": 7313 }, { "epoch": 0.569432144421196, "grad_norm": 0.847886137798467, "learning_rate": 1.6211781698230174e-05, "loss": 0.0903, "step": 7314 }, { "epoch": 0.5695099995134056, "grad_norm": 0.750367355166747, "learning_rate": 1.6206880373103696e-05, "loss": 0.081, "step": 7315 }, { "epoch": 0.5695878546056153, "grad_norm": 0.7704943146206682, "learning_rate": 1.6201979284270743e-05, "loss": 0.0769, "step": 7316 }, { "epoch": 0.5696657096978249, "grad_norm": 0.7939745907142195, "learning_rate": 1.619707843203662e-05, "loss": 0.0846, "step": 7317 }, { "epoch": 0.5697435647900345, "grad_norm": 0.7321356898793971, "learning_rate": 1.619217781670663e-05, "loss": 0.0769, "step": 7318 }, { "epoch": 0.5698214198822442, "grad_norm": 0.7906322713478824, "learning_rate": 1.6187277438586074e-05, "loss": 0.0767, "step": 7319 }, { "epoch": 0.5698992749744538, "grad_norm": 0.7520547868204537, "learning_rate": 1.618237729798021e-05, "loss": 0.0686, "step": 7320 }, { "epoch": 0.5699771300666634, "grad_norm": 0.8440901416510825, "learning_rate": 1.61774773951943e-05, "loss": 0.0919, "step": 7321 }, { "epoch": 0.5700549851588731, "grad_norm": 0.7858395873655474, "learning_rate": 1.617257773053358e-05, "loss": 0.1164, "step": 7322 }, { "epoch": 0.5701328402510827, "grad_norm": 0.8202290503305226, "learning_rate": 1.6167678304303272e-05, "loss": 0.0886, "step": 7323 }, { "epoch": 0.5702106953432923, "grad_norm": 0.8936965884918027, "learning_rate": 1.6162779116808584e-05, "loss": 0.1105, "step": 7324 }, { "epoch": 0.570288550435502, "grad_norm": 0.8151564902644449, "learning_rate": 1.6157880168354732e-05, "loss": 0.1104, "step": 7325 }, { "epoch": 0.5703664055277116, "grad_norm": 0.7056025152734302, "learning_rate": 1.615298145924689e-05, "loss": 0.07, "step": 7326 }, { "epoch": 0.5704442606199212, "grad_norm": 0.7343404417778833, "learning_rate": 1.6148082989790213e-05, "loss": 0.0733, "step": 7327 }, { "epoch": 0.5705221157121309, "grad_norm": 0.7276203387912453, "learning_rate": 1.6143184760289863e-05, "loss": 0.0797, "step": 7328 }, { "epoch": 0.5705999708043404, "grad_norm": 0.7153943262475435, "learning_rate": 1.6138286771050964e-05, "loss": 0.091, "step": 7329 }, { "epoch": 0.57067782589655, "grad_norm": 0.8245033284835076, "learning_rate": 1.613338902237866e-05, "loss": 0.0853, "step": 7330 }, { "epoch": 0.5707556809887596, "grad_norm": 0.7937738861473537, "learning_rate": 1.6128491514578043e-05, "loss": 0.0742, "step": 7331 }, { "epoch": 0.5708335360809693, "grad_norm": 0.8706023428451805, "learning_rate": 1.6123594247954204e-05, "loss": 0.0973, "step": 7332 }, { "epoch": 0.5709113911731789, "grad_norm": 0.7133085623111352, "learning_rate": 1.6118697222812224e-05, "loss": 0.0722, "step": 7333 }, { "epoch": 0.5709892462653885, "grad_norm": 0.7930853826579206, "learning_rate": 1.6113800439457157e-05, "loss": 0.0814, "step": 7334 }, { "epoch": 0.5710671013575982, "grad_norm": 0.7765039838998382, "learning_rate": 1.6108903898194065e-05, "loss": 0.092, "step": 7335 }, { "epoch": 0.5711449564498078, "grad_norm": 0.9926140056543965, "learning_rate": 1.610400759932797e-05, "loss": 0.115, "step": 7336 }, { "epoch": 0.5712228115420174, "grad_norm": 0.7394896480961153, "learning_rate": 1.6099111543163885e-05, "loss": 0.1022, "step": 7337 }, { "epoch": 0.5713006666342271, "grad_norm": 0.7977489615007782, "learning_rate": 1.6094215730006818e-05, "loss": 0.0842, "step": 7338 }, { "epoch": 0.5713785217264367, "grad_norm": 0.7006670328573719, "learning_rate": 1.6089320160161747e-05, "loss": 0.0791, "step": 7339 }, { "epoch": 0.5714563768186463, "grad_norm": 0.8304261263052239, "learning_rate": 1.608442483393365e-05, "loss": 0.1199, "step": 7340 }, { "epoch": 0.571534231910856, "grad_norm": 0.8142588619030499, "learning_rate": 1.6079529751627486e-05, "loss": 0.0991, "step": 7341 }, { "epoch": 0.5716120870030655, "grad_norm": 0.7050611892203277, "learning_rate": 1.6074634913548185e-05, "loss": 0.06, "step": 7342 }, { "epoch": 0.5716899420952751, "grad_norm": 0.7835261601893667, "learning_rate": 1.606974032000068e-05, "loss": 0.0647, "step": 7343 }, { "epoch": 0.5717677971874848, "grad_norm": 0.7032308916522815, "learning_rate": 1.6064845971289877e-05, "loss": 0.0589, "step": 7344 }, { "epoch": 0.5718456522796944, "grad_norm": 0.8044049559547676, "learning_rate": 1.6059951867720678e-05, "loss": 0.0676, "step": 7345 }, { "epoch": 0.571923507371904, "grad_norm": 0.7838725430709077, "learning_rate": 1.6055058009597954e-05, "loss": 0.0843, "step": 7346 }, { "epoch": 0.5720013624641137, "grad_norm": 0.7632591847327318, "learning_rate": 1.6050164397226576e-05, "loss": 0.0783, "step": 7347 }, { "epoch": 0.5720792175563233, "grad_norm": 0.7663443352350753, "learning_rate": 1.6045271030911388e-05, "loss": 0.0863, "step": 7348 }, { "epoch": 0.5721570726485329, "grad_norm": 0.8243297648486017, "learning_rate": 1.6040377910957223e-05, "loss": 0.1083, "step": 7349 }, { "epoch": 0.5722349277407426, "grad_norm": 0.7809519946567682, "learning_rate": 1.603548503766891e-05, "loss": 0.0855, "step": 7350 }, { "epoch": 0.5722349277407426, "eval_loss": 0.010996539145708084, "eval_runtime": 162.2899, "eval_samples_per_second": 17.746, "eval_steps_per_second": 0.635, "step": 7350 }, { "epoch": 0.5723127828329522, "grad_norm": 0.7212459192580514, "learning_rate": 1.6030592411351237e-05, "loss": 0.0566, "step": 7351 }, { "epoch": 0.5723906379251618, "grad_norm": 0.8037632267797101, "learning_rate": 1.6025700032309003e-05, "loss": 0.088, "step": 7352 }, { "epoch": 0.5724684930173715, "grad_norm": 0.8431959938967576, "learning_rate": 1.6020807900846976e-05, "loss": 0.0898, "step": 7353 }, { "epoch": 0.572546348109581, "grad_norm": 0.7576008516783499, "learning_rate": 1.6015916017269908e-05, "loss": 0.0674, "step": 7354 }, { "epoch": 0.5726242032017906, "grad_norm": 0.7272711896870675, "learning_rate": 1.6011024381882553e-05, "loss": 0.068, "step": 7355 }, { "epoch": 0.5727020582940003, "grad_norm": 0.7523942401424737, "learning_rate": 1.6006132994989623e-05, "loss": 0.0922, "step": 7356 }, { "epoch": 0.5727799133862099, "grad_norm": 0.7387560376124647, "learning_rate": 1.600124185689584e-05, "loss": 0.0986, "step": 7357 }, { "epoch": 0.5728577684784195, "grad_norm": 0.6917315532167142, "learning_rate": 1.599635096790589e-05, "loss": 0.0824, "step": 7358 }, { "epoch": 0.5729356235706292, "grad_norm": 0.8167783223198497, "learning_rate": 1.5991460328324456e-05, "loss": 0.1138, "step": 7359 }, { "epoch": 0.5730134786628388, "grad_norm": 0.7130628378633654, "learning_rate": 1.59865699384562e-05, "loss": 0.0641, "step": 7360 }, { "epoch": 0.5730913337550484, "grad_norm": 0.8841052588086565, "learning_rate": 1.5981679798605772e-05, "loss": 0.1203, "step": 7361 }, { "epoch": 0.5731691888472581, "grad_norm": 0.8849423924481754, "learning_rate": 1.597678990907781e-05, "loss": 0.0969, "step": 7362 }, { "epoch": 0.5732470439394677, "grad_norm": 0.7718136175881697, "learning_rate": 1.5971900270176922e-05, "loss": 0.0923, "step": 7363 }, { "epoch": 0.5733248990316773, "grad_norm": 0.8407296938268026, "learning_rate": 1.5967010882207717e-05, "loss": 0.086, "step": 7364 }, { "epoch": 0.573402754123887, "grad_norm": 0.7374458568594239, "learning_rate": 1.596212174547477e-05, "loss": 0.0659, "step": 7365 }, { "epoch": 0.5734806092160966, "grad_norm": 0.6904352098838286, "learning_rate": 1.5957232860282665e-05, "loss": 0.0928, "step": 7366 }, { "epoch": 0.5735584643083061, "grad_norm": 0.783401381858016, "learning_rate": 1.5952344226935946e-05, "loss": 0.0758, "step": 7367 }, { "epoch": 0.5736363194005157, "grad_norm": 0.8248116792015988, "learning_rate": 1.594745584573916e-05, "loss": 0.0827, "step": 7368 }, { "epoch": 0.5737141744927254, "grad_norm": 0.8780001815450663, "learning_rate": 1.5942567716996827e-05, "loss": 0.0928, "step": 7369 }, { "epoch": 0.573792029584935, "grad_norm": 0.8066217500829255, "learning_rate": 1.5937679841013447e-05, "loss": 0.0926, "step": 7370 }, { "epoch": 0.5738698846771446, "grad_norm": 0.7415654003799405, "learning_rate": 1.5932792218093524e-05, "loss": 0.0785, "step": 7371 }, { "epoch": 0.5739477397693543, "grad_norm": 0.7719145121223441, "learning_rate": 1.5927904848541528e-05, "loss": 0.0908, "step": 7372 }, { "epoch": 0.5740255948615639, "grad_norm": 0.7946909946521803, "learning_rate": 1.5923017732661917e-05, "loss": 0.096, "step": 7373 }, { "epoch": 0.5741034499537735, "grad_norm": 0.7949919483832688, "learning_rate": 1.5918130870759144e-05, "loss": 0.0793, "step": 7374 }, { "epoch": 0.5741813050459832, "grad_norm": 0.6532555669028874, "learning_rate": 1.5913244263137618e-05, "loss": 0.0513, "step": 7375 }, { "epoch": 0.5742591601381928, "grad_norm": 0.76246767920877, "learning_rate": 1.590835791010177e-05, "loss": 0.0894, "step": 7376 }, { "epoch": 0.5743370152304024, "grad_norm": 0.6830191540174586, "learning_rate": 1.5903471811956e-05, "loss": 0.0618, "step": 7377 }, { "epoch": 0.5744148703226121, "grad_norm": 0.7551671904382429, "learning_rate": 1.5898585969004675e-05, "loss": 0.077, "step": 7378 }, { "epoch": 0.5744927254148217, "grad_norm": 0.7328895004170757, "learning_rate": 1.589370038155217e-05, "loss": 0.0766, "step": 7379 }, { "epoch": 0.5745705805070312, "grad_norm": 0.8361687988496654, "learning_rate": 1.5888815049902818e-05, "loss": 0.113, "step": 7380 }, { "epoch": 0.574648435599241, "grad_norm": 0.8349165911988289, "learning_rate": 1.5883929974360974e-05, "loss": 0.1135, "step": 7381 }, { "epoch": 0.5747262906914505, "grad_norm": 0.8548869675435299, "learning_rate": 1.5879045155230946e-05, "loss": 0.112, "step": 7382 }, { "epoch": 0.5748041457836601, "grad_norm": 0.7535053216084571, "learning_rate": 1.5874160592817037e-05, "loss": 0.0619, "step": 7383 }, { "epoch": 0.5748820008758698, "grad_norm": 0.74111599731442, "learning_rate": 1.5869276287423526e-05, "loss": 0.0657, "step": 7384 }, { "epoch": 0.5749598559680794, "grad_norm": 0.8519932596556112, "learning_rate": 1.5864392239354677e-05, "loss": 0.0959, "step": 7385 }, { "epoch": 0.575037711060289, "grad_norm": 0.7253765426170153, "learning_rate": 1.585950844891476e-05, "loss": 0.0694, "step": 7386 }, { "epoch": 0.5751155661524987, "grad_norm": 0.7501850045981756, "learning_rate": 1.5854624916408006e-05, "loss": 0.0726, "step": 7387 }, { "epoch": 0.5751934212447083, "grad_norm": 0.8615734966253383, "learning_rate": 1.584974164213864e-05, "loss": 0.108, "step": 7388 }, { "epoch": 0.5752712763369179, "grad_norm": 0.7326593804249671, "learning_rate": 1.5844858626410856e-05, "loss": 0.0801, "step": 7389 }, { "epoch": 0.5753491314291276, "grad_norm": 0.7312473201464238, "learning_rate": 1.5839975869528844e-05, "loss": 0.079, "step": 7390 }, { "epoch": 0.5754269865213372, "grad_norm": 0.6967686401623455, "learning_rate": 1.5835093371796785e-05, "loss": 0.0622, "step": 7391 }, { "epoch": 0.5755048416135468, "grad_norm": 0.828041536573015, "learning_rate": 1.5830211133518837e-05, "loss": 0.1214, "step": 7392 }, { "epoch": 0.5755826967057565, "grad_norm": 0.809081706815699, "learning_rate": 1.5825329154999133e-05, "loss": 0.0807, "step": 7393 }, { "epoch": 0.575660551797966, "grad_norm": 0.7460075524763733, "learning_rate": 1.5820447436541798e-05, "loss": 0.1107, "step": 7394 }, { "epoch": 0.5757384068901756, "grad_norm": 0.7741566353485723, "learning_rate": 1.5815565978450938e-05, "loss": 0.0967, "step": 7395 }, { "epoch": 0.5758162619823853, "grad_norm": 0.7108804943093081, "learning_rate": 1.5810684781030653e-05, "loss": 0.0706, "step": 7396 }, { "epoch": 0.5758941170745949, "grad_norm": 0.7824268729091227, "learning_rate": 1.580580384458502e-05, "loss": 0.0933, "step": 7397 }, { "epoch": 0.5759719721668045, "grad_norm": 0.7497231139999522, "learning_rate": 1.5800923169418088e-05, "loss": 0.0789, "step": 7398 }, { "epoch": 0.5760498272590142, "grad_norm": 0.6985874530953803, "learning_rate": 1.579604275583391e-05, "loss": 0.0734, "step": 7399 }, { "epoch": 0.5761276823512238, "grad_norm": 0.8445218457774402, "learning_rate": 1.5791162604136504e-05, "loss": 0.0811, "step": 7400 }, { "epoch": 0.5761276823512238, "eval_loss": 0.010668785311281681, "eval_runtime": 162.2821, "eval_samples_per_second": 17.747, "eval_steps_per_second": 0.635, "step": 7400 }, { "epoch": 0.5762055374434334, "grad_norm": 0.7308706839497222, "learning_rate": 1.578628271462988e-05, "loss": 0.0785, "step": 7401 }, { "epoch": 0.576283392535643, "grad_norm": 0.7776036048884116, "learning_rate": 1.5781403087618043e-05, "loss": 0.0793, "step": 7402 }, { "epoch": 0.5763612476278527, "grad_norm": 0.7728659850792007, "learning_rate": 1.5776523723404958e-05, "loss": 0.0932, "step": 7403 }, { "epoch": 0.5764391027200623, "grad_norm": 0.7347927430533326, "learning_rate": 1.5771644622294598e-05, "loss": 0.1084, "step": 7404 }, { "epoch": 0.5765169578122719, "grad_norm": 0.8530680969205929, "learning_rate": 1.5766765784590905e-05, "loss": 0.0943, "step": 7405 }, { "epoch": 0.5765948129044816, "grad_norm": 0.635300824982781, "learning_rate": 1.57618872105978e-05, "loss": 0.0564, "step": 7406 }, { "epoch": 0.5766726679966911, "grad_norm": 0.7726965156933059, "learning_rate": 1.5757008900619203e-05, "loss": 0.1035, "step": 7407 }, { "epoch": 0.5767505230889007, "grad_norm": 0.7948625127498641, "learning_rate": 1.575213085495901e-05, "loss": 0.0974, "step": 7408 }, { "epoch": 0.5768283781811104, "grad_norm": 0.7791856931446929, "learning_rate": 1.5747253073921096e-05, "loss": 0.0739, "step": 7409 }, { "epoch": 0.57690623327332, "grad_norm": 0.7166512482360501, "learning_rate": 1.5742375557809328e-05, "loss": 0.0889, "step": 7410 }, { "epoch": 0.5769840883655296, "grad_norm": 0.8011577090755906, "learning_rate": 1.5737498306927546e-05, "loss": 0.069, "step": 7411 }, { "epoch": 0.5770619434577393, "grad_norm": 0.7620339126870136, "learning_rate": 1.5732621321579585e-05, "loss": 0.0827, "step": 7412 }, { "epoch": 0.5771397985499489, "grad_norm": 0.7825740899934029, "learning_rate": 1.572774460206926e-05, "loss": 0.0885, "step": 7413 }, { "epoch": 0.5772176536421585, "grad_norm": 0.6780866620511331, "learning_rate": 1.5722868148700365e-05, "loss": 0.0602, "step": 7414 }, { "epoch": 0.5772955087343682, "grad_norm": 0.6882015215595166, "learning_rate": 1.571799196177668e-05, "loss": 0.0642, "step": 7415 }, { "epoch": 0.5773733638265778, "grad_norm": 0.7772709698879383, "learning_rate": 1.571311604160196e-05, "loss": 0.0653, "step": 7416 }, { "epoch": 0.5774512189187874, "grad_norm": 0.7897238924983411, "learning_rate": 1.5708240388479968e-05, "loss": 0.0943, "step": 7417 }, { "epoch": 0.5775290740109971, "grad_norm": 0.8636644076864481, "learning_rate": 1.570336500271443e-05, "loss": 0.099, "step": 7418 }, { "epoch": 0.5776069291032067, "grad_norm": 0.75018567603411, "learning_rate": 1.5698489884609052e-05, "loss": 0.0889, "step": 7419 }, { "epoch": 0.5776847841954162, "grad_norm": 0.8398414740249658, "learning_rate": 1.569361503446754e-05, "loss": 0.0975, "step": 7420 }, { "epoch": 0.577762639287626, "grad_norm": 0.8144724240335673, "learning_rate": 1.5688740452593557e-05, "loss": 0.1003, "step": 7421 }, { "epoch": 0.5778404943798355, "grad_norm": 0.7787206941183917, "learning_rate": 1.568386613929079e-05, "loss": 0.0977, "step": 7422 }, { "epoch": 0.5779183494720451, "grad_norm": 0.7766571713835807, "learning_rate": 1.5678992094862874e-05, "loss": 0.0743, "step": 7423 }, { "epoch": 0.5779962045642548, "grad_norm": 0.746874680661105, "learning_rate": 1.5674118319613438e-05, "loss": 0.0783, "step": 7424 }, { "epoch": 0.5780740596564644, "grad_norm": 0.6952509631294177, "learning_rate": 1.5669244813846097e-05, "loss": 0.0715, "step": 7425 }, { "epoch": 0.578151914748674, "grad_norm": 0.7551574833540882, "learning_rate": 1.5664371577864446e-05, "loss": 0.0762, "step": 7426 }, { "epoch": 0.5782297698408837, "grad_norm": 0.6889057215175229, "learning_rate": 1.565949861197207e-05, "loss": 0.0659, "step": 7427 }, { "epoch": 0.5783076249330933, "grad_norm": 0.7331192732886922, "learning_rate": 1.5654625916472528e-05, "loss": 0.0966, "step": 7428 }, { "epoch": 0.5783854800253029, "grad_norm": 0.8233435234233729, "learning_rate": 1.5649753491669367e-05, "loss": 0.1063, "step": 7429 }, { "epoch": 0.5784633351175126, "grad_norm": 0.8079037487498323, "learning_rate": 1.5644881337866113e-05, "loss": 0.0903, "step": 7430 }, { "epoch": 0.5785411902097222, "grad_norm": 0.8537597463253854, "learning_rate": 1.564000945536628e-05, "loss": 0.1474, "step": 7431 }, { "epoch": 0.5786190453019318, "grad_norm": 0.652853702090627, "learning_rate": 1.5635137844473366e-05, "loss": 0.0753, "step": 7432 }, { "epoch": 0.5786969003941415, "grad_norm": 0.750948818330327, "learning_rate": 1.5630266505490847e-05, "loss": 0.0857, "step": 7433 }, { "epoch": 0.578774755486351, "grad_norm": 0.8321791530147016, "learning_rate": 1.562539543872219e-05, "loss": 0.0993, "step": 7434 }, { "epoch": 0.5788526105785606, "grad_norm": 0.9641217773124104, "learning_rate": 1.5620524644470828e-05, "loss": 0.0796, "step": 7435 }, { "epoch": 0.5789304656707703, "grad_norm": 0.747121201401147, "learning_rate": 1.5615654123040204e-05, "loss": 0.0835, "step": 7436 }, { "epoch": 0.5790083207629799, "grad_norm": 0.7488826364406016, "learning_rate": 1.5610783874733706e-05, "loss": 0.0996, "step": 7437 }, { "epoch": 0.5790861758551895, "grad_norm": 0.7606983989180345, "learning_rate": 1.560591389985475e-05, "loss": 0.0931, "step": 7438 }, { "epoch": 0.5791640309473991, "grad_norm": 0.841135890997045, "learning_rate": 1.56010441987067e-05, "loss": 0.101, "step": 7439 }, { "epoch": 0.5792418860396088, "grad_norm": 0.7117292203501567, "learning_rate": 1.5596174771592927e-05, "loss": 0.0562, "step": 7440 }, { "epoch": 0.5793197411318184, "grad_norm": 0.8919298335816239, "learning_rate": 1.5591305618816767e-05, "loss": 0.1111, "step": 7441 }, { "epoch": 0.579397596224028, "grad_norm": 0.8688245136845049, "learning_rate": 1.558643674068153e-05, "loss": 0.103, "step": 7442 }, { "epoch": 0.5794754513162377, "grad_norm": 0.8133104258725236, "learning_rate": 1.558156813749055e-05, "loss": 0.0901, "step": 7443 }, { "epoch": 0.5795533064084473, "grad_norm": 0.7974177022736354, "learning_rate": 1.55766998095471e-05, "loss": 0.0946, "step": 7444 }, { "epoch": 0.5796311615006569, "grad_norm": 0.8467058488084332, "learning_rate": 1.557183175715447e-05, "loss": 0.1299, "step": 7445 }, { "epoch": 0.5797090165928666, "grad_norm": 0.726250803600422, "learning_rate": 1.5566963980615902e-05, "loss": 0.0622, "step": 7446 }, { "epoch": 0.5797868716850761, "grad_norm": 0.8275828059312688, "learning_rate": 1.5562096480234634e-05, "loss": 0.1112, "step": 7447 }, { "epoch": 0.5798647267772857, "grad_norm": 0.7344739230099243, "learning_rate": 1.5557229256313905e-05, "loss": 0.0923, "step": 7448 }, { "epoch": 0.5799425818694954, "grad_norm": 0.8308590416663969, "learning_rate": 1.5552362309156905e-05, "loss": 0.0994, "step": 7449 }, { "epoch": 0.580020436961705, "grad_norm": 0.7512503805005603, "learning_rate": 1.5547495639066836e-05, "loss": 0.0777, "step": 7450 }, { "epoch": 0.580020436961705, "eval_loss": 0.010688990354537964, "eval_runtime": 161.8455, "eval_samples_per_second": 17.795, "eval_steps_per_second": 0.636, "step": 7450 }, { "epoch": 0.5800982920539146, "grad_norm": 0.7270894581477333, "learning_rate": 1.5542629246346853e-05, "loss": 0.0645, "step": 7451 }, { "epoch": 0.5801761471461243, "grad_norm": 0.7450719024955554, "learning_rate": 1.553776313130011e-05, "loss": 0.0769, "step": 7452 }, { "epoch": 0.5802540022383339, "grad_norm": 0.7018064729977176, "learning_rate": 1.553289729422976e-05, "loss": 0.0736, "step": 7453 }, { "epoch": 0.5803318573305435, "grad_norm": 0.6961756335824179, "learning_rate": 1.552803173543891e-05, "loss": 0.0644, "step": 7454 }, { "epoch": 0.5804097124227532, "grad_norm": 0.8144864930842953, "learning_rate": 1.5523166455230662e-05, "loss": 0.0848, "step": 7455 }, { "epoch": 0.5804875675149628, "grad_norm": 0.7605916069221651, "learning_rate": 1.5518301453908103e-05, "loss": 0.0803, "step": 7456 }, { "epoch": 0.5805654226071724, "grad_norm": 0.8033366752563132, "learning_rate": 1.5513436731774285e-05, "loss": 0.0913, "step": 7457 }, { "epoch": 0.5806432776993821, "grad_norm": 0.7557789974158884, "learning_rate": 1.550857228913228e-05, "loss": 0.0909, "step": 7458 }, { "epoch": 0.5807211327915917, "grad_norm": 0.7971708532738907, "learning_rate": 1.550370812628511e-05, "loss": 0.0784, "step": 7459 }, { "epoch": 0.5807989878838012, "grad_norm": 0.9021646276294231, "learning_rate": 1.5498844243535794e-05, "loss": 0.1407, "step": 7460 }, { "epoch": 0.5808768429760109, "grad_norm": 0.7495859698752028, "learning_rate": 1.549398064118732e-05, "loss": 0.0857, "step": 7461 }, { "epoch": 0.5809546980682205, "grad_norm": 0.8294068150923162, "learning_rate": 1.548911731954266e-05, "loss": 0.0898, "step": 7462 }, { "epoch": 0.5810325531604301, "grad_norm": 0.7452417662438247, "learning_rate": 1.54842542789048e-05, "loss": 0.074, "step": 7463 }, { "epoch": 0.5811104082526398, "grad_norm": 0.7281830351018127, "learning_rate": 1.5479391519576676e-05, "loss": 0.0561, "step": 7464 }, { "epoch": 0.5811882633448494, "grad_norm": 0.7293438388556757, "learning_rate": 1.5474529041861207e-05, "loss": 0.0717, "step": 7465 }, { "epoch": 0.581266118437059, "grad_norm": 0.7472789492815944, "learning_rate": 1.5469666846061308e-05, "loss": 0.0633, "step": 7466 }, { "epoch": 0.5813439735292687, "grad_norm": 0.7843573016980145, "learning_rate": 1.546480493247986e-05, "loss": 0.0836, "step": 7467 }, { "epoch": 0.5814218286214783, "grad_norm": 0.8226031216794741, "learning_rate": 1.5459943301419763e-05, "loss": 0.0839, "step": 7468 }, { "epoch": 0.5814996837136879, "grad_norm": 0.8563492714270605, "learning_rate": 1.5455081953183853e-05, "loss": 0.0857, "step": 7469 }, { "epoch": 0.5815775388058976, "grad_norm": 0.7532691930356374, "learning_rate": 1.545022088807497e-05, "loss": 0.0782, "step": 7470 }, { "epoch": 0.5816553938981072, "grad_norm": 0.7847027248845306, "learning_rate": 1.5445360106395944e-05, "loss": 0.0969, "step": 7471 }, { "epoch": 0.5817332489903168, "grad_norm": 0.7426453694988635, "learning_rate": 1.5440499608449566e-05, "loss": 0.0829, "step": 7472 }, { "epoch": 0.5818111040825263, "grad_norm": 0.6968625958072412, "learning_rate": 1.5435639394538645e-05, "loss": 0.0634, "step": 7473 }, { "epoch": 0.581888959174736, "grad_norm": 0.8133425854976916, "learning_rate": 1.5430779464965926e-05, "loss": 0.0814, "step": 7474 }, { "epoch": 0.5819668142669456, "grad_norm": 0.7066639349955441, "learning_rate": 1.5425919820034175e-05, "loss": 0.0751, "step": 7475 }, { "epoch": 0.5820446693591552, "grad_norm": 0.7669935029435345, "learning_rate": 1.5421060460046116e-05, "loss": 0.0843, "step": 7476 }, { "epoch": 0.5821225244513649, "grad_norm": 0.7713419424686881, "learning_rate": 1.541620138530447e-05, "loss": 0.0901, "step": 7477 }, { "epoch": 0.5822003795435745, "grad_norm": 0.7514295087150167, "learning_rate": 1.5411342596111923e-05, "loss": 0.0841, "step": 7478 }, { "epoch": 0.5822782346357841, "grad_norm": 0.6663153785408781, "learning_rate": 1.540648409277117e-05, "loss": 0.0642, "step": 7479 }, { "epoch": 0.5823560897279938, "grad_norm": 0.8082664953556586, "learning_rate": 1.5401625875584866e-05, "loss": 0.0959, "step": 7480 }, { "epoch": 0.5824339448202034, "grad_norm": 0.8012635023808771, "learning_rate": 1.539676794485566e-05, "loss": 0.0684, "step": 7481 }, { "epoch": 0.582511799912413, "grad_norm": 0.7616080272401393, "learning_rate": 1.539191030088617e-05, "loss": 0.0969, "step": 7482 }, { "epoch": 0.5825896550046227, "grad_norm": 0.890848428429058, "learning_rate": 1.5387052943979e-05, "loss": 0.155, "step": 7483 }, { "epoch": 0.5826675100968323, "grad_norm": 0.6578206013695655, "learning_rate": 1.538219587443676e-05, "loss": 0.06, "step": 7484 }, { "epoch": 0.5827453651890419, "grad_norm": 0.69178675025932, "learning_rate": 1.5377339092562008e-05, "loss": 0.0613, "step": 7485 }, { "epoch": 0.5828232202812516, "grad_norm": 0.7403750780775866, "learning_rate": 1.5372482598657306e-05, "loss": 0.0657, "step": 7486 }, { "epoch": 0.5829010753734611, "grad_norm": 0.8125361608761508, "learning_rate": 1.5367626393025184e-05, "loss": 0.1153, "step": 7487 }, { "epoch": 0.5829789304656707, "grad_norm": 0.834161179874514, "learning_rate": 1.5362770475968158e-05, "loss": 0.0944, "step": 7488 }, { "epoch": 0.5830567855578804, "grad_norm": 0.8180063237999295, "learning_rate": 1.535791484778874e-05, "loss": 0.0876, "step": 7489 }, { "epoch": 0.58313464065009, "grad_norm": 0.6773872883912512, "learning_rate": 1.5353059508789406e-05, "loss": 0.0624, "step": 7490 }, { "epoch": 0.5832124957422996, "grad_norm": 0.7046336462018219, "learning_rate": 1.5348204459272625e-05, "loss": 0.0743, "step": 7491 }, { "epoch": 0.5832903508345093, "grad_norm": 0.8835378242547367, "learning_rate": 1.5343349699540838e-05, "loss": 0.1079, "step": 7492 }, { "epoch": 0.5833682059267189, "grad_norm": 0.7639475011308794, "learning_rate": 1.5338495229896474e-05, "loss": 0.0793, "step": 7493 }, { "epoch": 0.5834460610189285, "grad_norm": 0.7355135201120395, "learning_rate": 1.5333641050641952e-05, "loss": 0.1039, "step": 7494 }, { "epoch": 0.5835239161111382, "grad_norm": 0.8045010397645684, "learning_rate": 1.5328787162079655e-05, "loss": 0.075, "step": 7495 }, { "epoch": 0.5836017712033478, "grad_norm": 0.7669176515342658, "learning_rate": 1.5323933564511963e-05, "loss": 0.0888, "step": 7496 }, { "epoch": 0.5836796262955574, "grad_norm": 0.6715531998288672, "learning_rate": 1.5319080258241232e-05, "loss": 0.0682, "step": 7497 }, { "epoch": 0.5837574813877671, "grad_norm": 0.7008019945999283, "learning_rate": 1.5314227243569793e-05, "loss": 0.0694, "step": 7498 }, { "epoch": 0.5838353364799767, "grad_norm": 0.84446939310265, "learning_rate": 1.5309374520799975e-05, "loss": 0.0754, "step": 7499 }, { "epoch": 0.5839131915721862, "grad_norm": 0.8928644966645615, "learning_rate": 1.5304522090234077e-05, "loss": 0.1099, "step": 7500 }, { "epoch": 0.5839131915721862, "eval_loss": 0.010454021394252777, "eval_runtime": 162.7896, "eval_samples_per_second": 17.692, "eval_steps_per_second": 0.633, "step": 7500 }, { "epoch": 0.5839910466643959, "grad_norm": 0.6901499542279322, "learning_rate": 1.5299669952174382e-05, "loss": 0.0565, "step": 7501 }, { "epoch": 0.5840689017566055, "grad_norm": 0.8532946942572557, "learning_rate": 1.5294818106923158e-05, "loss": 0.0945, "step": 7502 }, { "epoch": 0.5841467568488151, "grad_norm": 0.7368002464124698, "learning_rate": 1.528996655478264e-05, "loss": 0.0523, "step": 7503 }, { "epoch": 0.5842246119410248, "grad_norm": 0.7342293589021504, "learning_rate": 1.5285115296055076e-05, "loss": 0.0891, "step": 7504 }, { "epoch": 0.5843024670332344, "grad_norm": 0.7508405298301161, "learning_rate": 1.5280264331042666e-05, "loss": 0.099, "step": 7505 }, { "epoch": 0.584380322125444, "grad_norm": 0.7429080595645063, "learning_rate": 1.52754136600476e-05, "loss": 0.0647, "step": 7506 }, { "epoch": 0.5844581772176537, "grad_norm": 0.783081793223521, "learning_rate": 1.5270563283372058e-05, "loss": 0.0766, "step": 7507 }, { "epoch": 0.5845360323098633, "grad_norm": 0.851915337275803, "learning_rate": 1.5265713201318187e-05, "loss": 0.1037, "step": 7508 }, { "epoch": 0.5846138874020729, "grad_norm": 0.6396977859438489, "learning_rate": 1.5260863414188137e-05, "loss": 0.0622, "step": 7509 }, { "epoch": 0.5846917424942825, "grad_norm": 0.7243008529414542, "learning_rate": 1.5256013922284022e-05, "loss": 0.0783, "step": 7510 }, { "epoch": 0.5847695975864922, "grad_norm": 0.6361812642326833, "learning_rate": 1.525116472590794e-05, "loss": 0.0651, "step": 7511 }, { "epoch": 0.5848474526787018, "grad_norm": 0.7102463053361856, "learning_rate": 1.5246315825361975e-05, "loss": 0.0652, "step": 7512 }, { "epoch": 0.5849253077709113, "grad_norm": 0.8042942943335222, "learning_rate": 1.5241467220948188e-05, "loss": 0.1055, "step": 7513 }, { "epoch": 0.585003162863121, "grad_norm": 0.7474988690293304, "learning_rate": 1.5236618912968618e-05, "loss": 0.0829, "step": 7514 }, { "epoch": 0.5850810179553306, "grad_norm": 0.7576227989923217, "learning_rate": 1.5231770901725307e-05, "loss": 0.0681, "step": 7515 }, { "epoch": 0.5851588730475402, "grad_norm": 0.8873699908862265, "learning_rate": 1.5226923187520255e-05, "loss": 0.1166, "step": 7516 }, { "epoch": 0.5852367281397499, "grad_norm": 0.7651131892510917, "learning_rate": 1.522207577065546e-05, "loss": 0.0984, "step": 7517 }, { "epoch": 0.5853145832319595, "grad_norm": 0.6761980167690504, "learning_rate": 1.5217228651432883e-05, "loss": 0.067, "step": 7518 }, { "epoch": 0.5853924383241691, "grad_norm": 0.7582165504562677, "learning_rate": 1.5212381830154467e-05, "loss": 0.0761, "step": 7519 }, { "epoch": 0.5854702934163788, "grad_norm": 0.901346657834289, "learning_rate": 1.520753530712217e-05, "loss": 0.0984, "step": 7520 }, { "epoch": 0.5855481485085884, "grad_norm": 0.8415010763500979, "learning_rate": 1.5202689082637902e-05, "loss": 0.1011, "step": 7521 }, { "epoch": 0.585626003600798, "grad_norm": 0.7531656931743139, "learning_rate": 1.5197843157003548e-05, "loss": 0.0779, "step": 7522 }, { "epoch": 0.5857038586930077, "grad_norm": 0.6949805471492672, "learning_rate": 1.5192997530520996e-05, "loss": 0.0891, "step": 7523 }, { "epoch": 0.5857817137852173, "grad_norm": 0.8416667671469709, "learning_rate": 1.5188152203492095e-05, "loss": 0.0914, "step": 7524 }, { "epoch": 0.5858595688774269, "grad_norm": 0.7295450727437358, "learning_rate": 1.51833071762187e-05, "loss": 0.0864, "step": 7525 }, { "epoch": 0.5859374239696366, "grad_norm": 0.783307535138893, "learning_rate": 1.5178462449002633e-05, "loss": 0.1061, "step": 7526 }, { "epoch": 0.5860152790618461, "grad_norm": 0.7102922644462145, "learning_rate": 1.5173618022145689e-05, "loss": 0.068, "step": 7527 }, { "epoch": 0.5860931341540557, "grad_norm": 0.7051871862141347, "learning_rate": 1.5168773895949653e-05, "loss": 0.0643, "step": 7528 }, { "epoch": 0.5861709892462654, "grad_norm": 0.7654427548095093, "learning_rate": 1.5163930070716288e-05, "loss": 0.0671, "step": 7529 }, { "epoch": 0.586248844338475, "grad_norm": 0.7320760352257996, "learning_rate": 1.5159086546747362e-05, "loss": 0.0741, "step": 7530 }, { "epoch": 0.5863266994306846, "grad_norm": 0.7229416618835762, "learning_rate": 1.5154243324344581e-05, "loss": 0.0964, "step": 7531 }, { "epoch": 0.5864045545228943, "grad_norm": 0.677623609391544, "learning_rate": 1.5149400403809669e-05, "loss": 0.0828, "step": 7532 }, { "epoch": 0.5864824096151039, "grad_norm": 0.6944411111280088, "learning_rate": 1.5144557785444309e-05, "loss": 0.077, "step": 7533 }, { "epoch": 0.5865602647073135, "grad_norm": 0.7176494244630243, "learning_rate": 1.513971546955017e-05, "loss": 0.0675, "step": 7534 }, { "epoch": 0.5866381197995232, "grad_norm": 0.669233813381141, "learning_rate": 1.5134873456428922e-05, "loss": 0.0628, "step": 7535 }, { "epoch": 0.5867159748917328, "grad_norm": 0.8095599745467023, "learning_rate": 1.5130031746382188e-05, "loss": 0.1336, "step": 7536 }, { "epoch": 0.5867938299839424, "grad_norm": 0.6836899089526761, "learning_rate": 1.5125190339711584e-05, "loss": 0.0663, "step": 7537 }, { "epoch": 0.5868716850761521, "grad_norm": 0.684565291908518, "learning_rate": 1.512034923671871e-05, "loss": 0.0752, "step": 7538 }, { "epoch": 0.5869495401683616, "grad_norm": 0.6910072501985867, "learning_rate": 1.5115508437705136e-05, "loss": 0.0703, "step": 7539 }, { "epoch": 0.5870273952605712, "grad_norm": 0.7977171665052991, "learning_rate": 1.5110667942972433e-05, "loss": 0.0889, "step": 7540 }, { "epoch": 0.5871052503527809, "grad_norm": 0.6851540430674645, "learning_rate": 1.510582775282213e-05, "loss": 0.0488, "step": 7541 }, { "epoch": 0.5871831054449905, "grad_norm": 0.7815705731688012, "learning_rate": 1.510098786755576e-05, "loss": 0.0811, "step": 7542 }, { "epoch": 0.5872609605372001, "grad_norm": 0.7325117825979072, "learning_rate": 1.5096148287474814e-05, "loss": 0.0667, "step": 7543 }, { "epoch": 0.5873388156294098, "grad_norm": 0.7165336599719726, "learning_rate": 1.509130901288078e-05, "loss": 0.0878, "step": 7544 }, { "epoch": 0.5874166707216194, "grad_norm": 0.7372308170040894, "learning_rate": 1.508647004407512e-05, "loss": 0.0701, "step": 7545 }, { "epoch": 0.587494525813829, "grad_norm": 0.8178412019234703, "learning_rate": 1.5081631381359286e-05, "loss": 0.1028, "step": 7546 }, { "epoch": 0.5875723809060386, "grad_norm": 0.8341186531253802, "learning_rate": 1.5076793025034694e-05, "loss": 0.1033, "step": 7547 }, { "epoch": 0.5876502359982483, "grad_norm": 0.8264745048880586, "learning_rate": 1.5071954975402759e-05, "loss": 0.0997, "step": 7548 }, { "epoch": 0.5877280910904579, "grad_norm": 0.7741263487125178, "learning_rate": 1.506711723276486e-05, "loss": 0.0749, "step": 7549 }, { "epoch": 0.5878059461826675, "grad_norm": 0.6759250744237428, "learning_rate": 1.5062279797422374e-05, "loss": 0.0656, "step": 7550 }, { "epoch": 0.5878059461826675, "eval_loss": 0.010222531855106354, "eval_runtime": 162.3076, "eval_samples_per_second": 17.744, "eval_steps_per_second": 0.635, "step": 7550 }, { "epoch": 0.5878838012748772, "grad_norm": 0.7206429537817344, "learning_rate": 1.5057442669676648e-05, "loss": 0.0815, "step": 7551 }, { "epoch": 0.5879616563670867, "grad_norm": 0.7364714930523254, "learning_rate": 1.5052605849829012e-05, "loss": 0.0786, "step": 7552 }, { "epoch": 0.5880395114592963, "grad_norm": 0.6831492318166733, "learning_rate": 1.5047769338180776e-05, "loss": 0.0681, "step": 7553 }, { "epoch": 0.588117366551506, "grad_norm": 0.7235906511757225, "learning_rate": 1.5042933135033233e-05, "loss": 0.0765, "step": 7554 }, { "epoch": 0.5881952216437156, "grad_norm": 0.7824914314237419, "learning_rate": 1.503809724068765e-05, "loss": 0.0792, "step": 7555 }, { "epoch": 0.5882730767359252, "grad_norm": 0.7608055238564739, "learning_rate": 1.5033261655445293e-05, "loss": 0.0848, "step": 7556 }, { "epoch": 0.5883509318281349, "grad_norm": 0.8062192373167225, "learning_rate": 1.5028426379607386e-05, "loss": 0.0942, "step": 7557 }, { "epoch": 0.5884287869203445, "grad_norm": 0.8084021194272805, "learning_rate": 1.5023591413475149e-05, "loss": 0.0905, "step": 7558 }, { "epoch": 0.5885066420125541, "grad_norm": 0.7977054572348677, "learning_rate": 1.5018756757349775e-05, "loss": 0.0933, "step": 7559 }, { "epoch": 0.5885844971047638, "grad_norm": 0.8035522794033086, "learning_rate": 1.5013922411532439e-05, "loss": 0.0851, "step": 7560 }, { "epoch": 0.5886623521969734, "grad_norm": 0.7533598579050361, "learning_rate": 1.5009088376324301e-05, "loss": 0.0972, "step": 7561 }, { "epoch": 0.588740207289183, "grad_norm": 0.6768465766319215, "learning_rate": 1.50042546520265e-05, "loss": 0.0558, "step": 7562 }, { "epoch": 0.5888180623813927, "grad_norm": 0.7981119883994867, "learning_rate": 1.499942123894015e-05, "loss": 0.1052, "step": 7563 }, { "epoch": 0.5888959174736023, "grad_norm": 0.8095018699735833, "learning_rate": 1.4994588137366353e-05, "loss": 0.1042, "step": 7564 }, { "epoch": 0.5889737725658118, "grad_norm": 0.7998364725335735, "learning_rate": 1.4989755347606182e-05, "loss": 0.0892, "step": 7565 }, { "epoch": 0.5890516276580215, "grad_norm": 0.6610652920155559, "learning_rate": 1.4984922869960713e-05, "loss": 0.0546, "step": 7566 }, { "epoch": 0.5891294827502311, "grad_norm": 0.640703316607648, "learning_rate": 1.498009070473097e-05, "loss": 0.0713, "step": 7567 }, { "epoch": 0.5892073378424407, "grad_norm": 0.7010964150698433, "learning_rate": 1.4975258852217982e-05, "loss": 0.08, "step": 7568 }, { "epoch": 0.5892851929346504, "grad_norm": 0.7311443121750542, "learning_rate": 1.4970427312722748e-05, "loss": 0.0835, "step": 7569 }, { "epoch": 0.58936304802686, "grad_norm": 0.7051746276462529, "learning_rate": 1.496559608654625e-05, "loss": 0.0578, "step": 7570 }, { "epoch": 0.5894409031190696, "grad_norm": 0.6715250484213705, "learning_rate": 1.4960765173989456e-05, "loss": 0.0589, "step": 7571 }, { "epoch": 0.5895187582112793, "grad_norm": 0.7250787398003449, "learning_rate": 1.49559345753533e-05, "loss": 0.0917, "step": 7572 }, { "epoch": 0.5895966133034889, "grad_norm": 0.7897162200329946, "learning_rate": 1.4951104290938717e-05, "loss": 0.1136, "step": 7573 }, { "epoch": 0.5896744683956985, "grad_norm": 0.6668847728639468, "learning_rate": 1.4946274321046605e-05, "loss": 0.0624, "step": 7574 }, { "epoch": 0.5897523234879082, "grad_norm": 0.7326444833812371, "learning_rate": 1.494144466597784e-05, "loss": 0.0757, "step": 7575 }, { "epoch": 0.5898301785801178, "grad_norm": 0.7143117465276866, "learning_rate": 1.49366153260333e-05, "loss": 0.0847, "step": 7576 }, { "epoch": 0.5899080336723274, "grad_norm": 0.7092955393441451, "learning_rate": 1.4931786301513827e-05, "loss": 0.0668, "step": 7577 }, { "epoch": 0.5899858887645371, "grad_norm": 0.6314067698634496, "learning_rate": 1.4926957592720241e-05, "loss": 0.065, "step": 7578 }, { "epoch": 0.5900637438567466, "grad_norm": 0.7934456159828704, "learning_rate": 1.4922129199953359e-05, "loss": 0.0743, "step": 7579 }, { "epoch": 0.5901415989489562, "grad_norm": 0.7472265717399426, "learning_rate": 1.4917301123513946e-05, "loss": 0.0855, "step": 7580 }, { "epoch": 0.5902194540411658, "grad_norm": 0.7345467448575679, "learning_rate": 1.4912473363702791e-05, "loss": 0.0695, "step": 7581 }, { "epoch": 0.5902973091333755, "grad_norm": 0.7134356824992392, "learning_rate": 1.4907645920820633e-05, "loss": 0.0622, "step": 7582 }, { "epoch": 0.5903751642255851, "grad_norm": 0.6797477520175397, "learning_rate": 1.4902818795168195e-05, "loss": 0.0512, "step": 7583 }, { "epoch": 0.5904530193177947, "grad_norm": 0.7842470525944811, "learning_rate": 1.4897991987046195e-05, "loss": 0.0942, "step": 7584 }, { "epoch": 0.5905308744100044, "grad_norm": 0.6879429514009746, "learning_rate": 1.4893165496755297e-05, "loss": 0.0648, "step": 7585 }, { "epoch": 0.590608729502214, "grad_norm": 0.6884371919677689, "learning_rate": 1.4888339324596194e-05, "loss": 0.0652, "step": 7586 }, { "epoch": 0.5906865845944236, "grad_norm": 0.7306552713030512, "learning_rate": 1.4883513470869523e-05, "loss": 0.068, "step": 7587 }, { "epoch": 0.5907644396866333, "grad_norm": 0.7237287286651455, "learning_rate": 1.4878687935875919e-05, "loss": 0.0878, "step": 7588 }, { "epoch": 0.5908422947788429, "grad_norm": 0.718261625779422, "learning_rate": 1.4873862719915981e-05, "loss": 0.0789, "step": 7589 }, { "epoch": 0.5909201498710525, "grad_norm": 0.6788545849817577, "learning_rate": 1.48690378232903e-05, "loss": 0.0637, "step": 7590 }, { "epoch": 0.5909980049632622, "grad_norm": 0.62928043652742, "learning_rate": 1.4864213246299435e-05, "loss": 0.06, "step": 7591 }, { "epoch": 0.5910758600554717, "grad_norm": 0.6543081949486164, "learning_rate": 1.4859388989243954e-05, "loss": 0.0556, "step": 7592 }, { "epoch": 0.5911537151476813, "grad_norm": 0.74347674377854, "learning_rate": 1.485456505242438e-05, "loss": 0.0766, "step": 7593 }, { "epoch": 0.591231570239891, "grad_norm": 0.7889272400863769, "learning_rate": 1.4849741436141213e-05, "loss": 0.1051, "step": 7594 }, { "epoch": 0.5913094253321006, "grad_norm": 0.6323592846223712, "learning_rate": 1.4844918140694946e-05, "loss": 0.07, "step": 7595 }, { "epoch": 0.5913872804243102, "grad_norm": 0.770149947794993, "learning_rate": 1.484009516638604e-05, "loss": 0.0856, "step": 7596 }, { "epoch": 0.5914651355165199, "grad_norm": 0.7522865786762541, "learning_rate": 1.4835272513514963e-05, "loss": 0.0832, "step": 7597 }, { "epoch": 0.5915429906087295, "grad_norm": 0.7278395502525272, "learning_rate": 1.4830450182382131e-05, "loss": 0.0614, "step": 7598 }, { "epoch": 0.5916208457009391, "grad_norm": 0.6921945215079045, "learning_rate": 1.482562817328795e-05, "loss": 0.0739, "step": 7599 }, { "epoch": 0.5916987007931488, "grad_norm": 0.7800400549227752, "learning_rate": 1.4820806486532813e-05, "loss": 0.1007, "step": 7600 }, { "epoch": 0.5916987007931488, "eval_loss": 0.010034138336777687, "eval_runtime": 161.946, "eval_samples_per_second": 17.784, "eval_steps_per_second": 0.636, "step": 7600 }, { "epoch": 0.5917765558853584, "grad_norm": 0.7829609225319708, "learning_rate": 1.4815985122417078e-05, "loss": 0.078, "step": 7601 }, { "epoch": 0.591854410977568, "grad_norm": 0.7361690559687131, "learning_rate": 1.4811164081241118e-05, "loss": 0.08, "step": 7602 }, { "epoch": 0.5919322660697777, "grad_norm": 0.6848445327031184, "learning_rate": 1.4806343363305242e-05, "loss": 0.0579, "step": 7603 }, { "epoch": 0.5920101211619873, "grad_norm": 0.7179899565962323, "learning_rate": 1.4801522968909759e-05, "loss": 0.066, "step": 7604 }, { "epoch": 0.5920879762541968, "grad_norm": 0.8162954119027496, "learning_rate": 1.4796702898354962e-05, "loss": 0.0754, "step": 7605 }, { "epoch": 0.5921658313464065, "grad_norm": 0.68752356512986, "learning_rate": 1.4791883151941114e-05, "loss": 0.0547, "step": 7606 }, { "epoch": 0.5922436864386161, "grad_norm": 0.7392018918824893, "learning_rate": 1.4787063729968467e-05, "loss": 0.0566, "step": 7607 }, { "epoch": 0.5923215415308257, "grad_norm": 0.7683661790776732, "learning_rate": 1.4782244632737248e-05, "loss": 0.0722, "step": 7608 }, { "epoch": 0.5923993966230354, "grad_norm": 0.6331465793804033, "learning_rate": 1.4777425860547663e-05, "loss": 0.0691, "step": 7609 }, { "epoch": 0.592477251715245, "grad_norm": 0.6782873159686391, "learning_rate": 1.4772607413699898e-05, "loss": 0.0707, "step": 7610 }, { "epoch": 0.5925551068074546, "grad_norm": 0.6827323824784116, "learning_rate": 1.4767789292494117e-05, "loss": 0.0866, "step": 7611 }, { "epoch": 0.5926329618996643, "grad_norm": 0.7976113134258713, "learning_rate": 1.4762971497230471e-05, "loss": 0.0832, "step": 7612 }, { "epoch": 0.5927108169918739, "grad_norm": 0.7377737790822398, "learning_rate": 1.4758154028209087e-05, "loss": 0.1067, "step": 7613 }, { "epoch": 0.5927886720840835, "grad_norm": 0.6548238054799644, "learning_rate": 1.4753336885730068e-05, "loss": 0.0556, "step": 7614 }, { "epoch": 0.5928665271762932, "grad_norm": 0.7812131850515872, "learning_rate": 1.47485200700935e-05, "loss": 0.0994, "step": 7615 }, { "epoch": 0.5929443822685028, "grad_norm": 0.7004357934904158, "learning_rate": 1.474370358159944e-05, "loss": 0.0647, "step": 7616 }, { "epoch": 0.5930222373607124, "grad_norm": 0.6085509649288899, "learning_rate": 1.4738887420547947e-05, "loss": 0.0655, "step": 7617 }, { "epoch": 0.593100092452922, "grad_norm": 0.7473016453039535, "learning_rate": 1.4734071587239039e-05, "loss": 0.068, "step": 7618 }, { "epoch": 0.5931779475451316, "grad_norm": 0.705666191694754, "learning_rate": 1.4729256081972717e-05, "loss": 0.0743, "step": 7619 }, { "epoch": 0.5932558026373412, "grad_norm": 0.696800191551176, "learning_rate": 1.4724440905048967e-05, "loss": 0.073, "step": 7620 }, { "epoch": 0.5933336577295508, "grad_norm": 0.6586950302704737, "learning_rate": 1.4719626056767747e-05, "loss": 0.0654, "step": 7621 }, { "epoch": 0.5934115128217605, "grad_norm": 0.7068844102371011, "learning_rate": 1.471481153742901e-05, "loss": 0.0881, "step": 7622 }, { "epoch": 0.5934893679139701, "grad_norm": 0.7291765125606458, "learning_rate": 1.4709997347332667e-05, "loss": 0.0785, "step": 7623 }, { "epoch": 0.5935672230061797, "grad_norm": 0.6108028176936164, "learning_rate": 1.470518348677863e-05, "loss": 0.0643, "step": 7624 }, { "epoch": 0.5936450780983894, "grad_norm": 0.6925191725721347, "learning_rate": 1.4700369956066771e-05, "loss": 0.1087, "step": 7625 }, { "epoch": 0.593722933190599, "grad_norm": 0.7235995541159186, "learning_rate": 1.4695556755496948e-05, "loss": 0.0643, "step": 7626 }, { "epoch": 0.5938007882828086, "grad_norm": 0.7532387114306648, "learning_rate": 1.469074388536901e-05, "loss": 0.0924, "step": 7627 }, { "epoch": 0.5938786433750183, "grad_norm": 0.6924193463375018, "learning_rate": 1.4685931345982773e-05, "loss": 0.0746, "step": 7628 }, { "epoch": 0.5939564984672279, "grad_norm": 0.7666463539249501, "learning_rate": 1.4681119137638039e-05, "loss": 0.0896, "step": 7629 }, { "epoch": 0.5940343535594375, "grad_norm": 0.7198413513696897, "learning_rate": 1.4676307260634577e-05, "loss": 0.08, "step": 7630 }, { "epoch": 0.5941122086516472, "grad_norm": 0.6430786669606574, "learning_rate": 1.467149571527215e-05, "loss": 0.0718, "step": 7631 }, { "epoch": 0.5941900637438567, "grad_norm": 0.6810738060826964, "learning_rate": 1.466668450185049e-05, "loss": 0.0821, "step": 7632 }, { "epoch": 0.5942679188360663, "grad_norm": 0.8609373897710468, "learning_rate": 1.466187362066932e-05, "loss": 0.1225, "step": 7633 }, { "epoch": 0.594345773928276, "grad_norm": 0.6953749600804524, "learning_rate": 1.4657063072028335e-05, "loss": 0.0631, "step": 7634 }, { "epoch": 0.5944236290204856, "grad_norm": 0.7150349708536192, "learning_rate": 1.4652252856227203e-05, "loss": 0.0737, "step": 7635 }, { "epoch": 0.5945014841126952, "grad_norm": 0.8485799828790342, "learning_rate": 1.4647442973565581e-05, "loss": 0.0966, "step": 7636 }, { "epoch": 0.5945793392049049, "grad_norm": 0.6777163519162276, "learning_rate": 1.4642633424343103e-05, "loss": 0.0733, "step": 7637 }, { "epoch": 0.5946571942971145, "grad_norm": 0.5576067515194871, "learning_rate": 1.463782420885938e-05, "loss": 0.0447, "step": 7638 }, { "epoch": 0.5947350493893241, "grad_norm": 0.6602125701918145, "learning_rate": 1.4633015327414009e-05, "loss": 0.0567, "step": 7639 }, { "epoch": 0.5948129044815338, "grad_norm": 0.7295282010730562, "learning_rate": 1.4628206780306551e-05, "loss": 0.0768, "step": 7640 }, { "epoch": 0.5948907595737434, "grad_norm": 0.6010744225630339, "learning_rate": 1.4623398567836569e-05, "loss": 0.0482, "step": 7641 }, { "epoch": 0.594968614665953, "grad_norm": 0.7166277212318909, "learning_rate": 1.461859069030357e-05, "loss": 0.074, "step": 7642 }, { "epoch": 0.5950464697581627, "grad_norm": 0.7119149752838372, "learning_rate": 1.4613783148007084e-05, "loss": 0.0769, "step": 7643 }, { "epoch": 0.5951243248503723, "grad_norm": 0.7266412801933474, "learning_rate": 1.4608975941246591e-05, "loss": 0.0903, "step": 7644 }, { "epoch": 0.5952021799425818, "grad_norm": 0.7244807463122057, "learning_rate": 1.4604169070321556e-05, "loss": 0.0777, "step": 7645 }, { "epoch": 0.5952800350347915, "grad_norm": 0.6409374460385157, "learning_rate": 1.459936253553143e-05, "loss": 0.0787, "step": 7646 }, { "epoch": 0.5953578901270011, "grad_norm": 0.6801765811678182, "learning_rate": 1.459455633717562e-05, "loss": 0.0747, "step": 7647 }, { "epoch": 0.5954357452192107, "grad_norm": 0.714003932765523, "learning_rate": 1.4589750475553552e-05, "loss": 0.0858, "step": 7648 }, { "epoch": 0.5955136003114204, "grad_norm": 0.695030036833573, "learning_rate": 1.4584944950964598e-05, "loss": 0.0776, "step": 7649 }, { "epoch": 0.59559145540363, "grad_norm": 0.6665278805584617, "learning_rate": 1.4580139763708125e-05, "loss": 0.0727, "step": 7650 }, { "epoch": 0.59559145540363, "eval_loss": 0.009906797669827938, "eval_runtime": 162.1547, "eval_samples_per_second": 17.761, "eval_steps_per_second": 0.635, "step": 7650 }, { "epoch": 0.5956693104958396, "grad_norm": 0.6585005420084997, "learning_rate": 1.4575334914083466e-05, "loss": 0.0651, "step": 7651 }, { "epoch": 0.5957471655880492, "grad_norm": 0.7723701102153109, "learning_rate": 1.4570530402389933e-05, "loss": 0.0933, "step": 7652 }, { "epoch": 0.5958250206802589, "grad_norm": 0.7528359906063721, "learning_rate": 1.4565726228926844e-05, "loss": 0.0902, "step": 7653 }, { "epoch": 0.5959028757724685, "grad_norm": 0.6499971358461506, "learning_rate": 1.4560922393993469e-05, "loss": 0.0648, "step": 7654 }, { "epoch": 0.5959807308646781, "grad_norm": 0.7464172722813796, "learning_rate": 1.4556118897889067e-05, "loss": 0.0965, "step": 7655 }, { "epoch": 0.5960585859568878, "grad_norm": 0.766033422867041, "learning_rate": 1.4551315740912863e-05, "loss": 0.1072, "step": 7656 }, { "epoch": 0.5961364410490974, "grad_norm": 0.6334670036795298, "learning_rate": 1.4546512923364072e-05, "loss": 0.0742, "step": 7657 }, { "epoch": 0.5962142961413069, "grad_norm": 0.7115157016188408, "learning_rate": 1.45417104455419e-05, "loss": 0.0754, "step": 7658 }, { "epoch": 0.5962921512335166, "grad_norm": 0.7543959468703247, "learning_rate": 1.4536908307745511e-05, "loss": 0.0729, "step": 7659 }, { "epoch": 0.5963700063257262, "grad_norm": 0.7033954725472877, "learning_rate": 1.4532106510274056e-05, "loss": 0.0851, "step": 7660 }, { "epoch": 0.5964478614179358, "grad_norm": 0.8197172934235558, "learning_rate": 1.4527305053426663e-05, "loss": 0.1064, "step": 7661 }, { "epoch": 0.5965257165101455, "grad_norm": 0.6652176892331009, "learning_rate": 1.452250393750243e-05, "loss": 0.0705, "step": 7662 }, { "epoch": 0.5966035716023551, "grad_norm": 0.6523445737807203, "learning_rate": 1.4517703162800467e-05, "loss": 0.0553, "step": 7663 }, { "epoch": 0.5966814266945647, "grad_norm": 0.7067125032311806, "learning_rate": 1.451290272961983e-05, "loss": 0.0717, "step": 7664 }, { "epoch": 0.5967592817867744, "grad_norm": 0.6949815272663077, "learning_rate": 1.4508102638259561e-05, "loss": 0.0779, "step": 7665 }, { "epoch": 0.596837136878984, "grad_norm": 0.7756715125661162, "learning_rate": 1.450330288901868e-05, "loss": 0.0757, "step": 7666 }, { "epoch": 0.5969149919711936, "grad_norm": 0.7143809649713819, "learning_rate": 1.449850348219619e-05, "loss": 0.077, "step": 7667 }, { "epoch": 0.5969928470634033, "grad_norm": 0.739807085051668, "learning_rate": 1.4493704418091074e-05, "loss": 0.0809, "step": 7668 }, { "epoch": 0.5970707021556129, "grad_norm": 0.6935919103800167, "learning_rate": 1.4488905697002294e-05, "loss": 0.0811, "step": 7669 }, { "epoch": 0.5971485572478225, "grad_norm": 0.6863421716952209, "learning_rate": 1.4484107319228784e-05, "loss": 0.0782, "step": 7670 }, { "epoch": 0.5972264123400322, "grad_norm": 0.6956547926127682, "learning_rate": 1.447930928506946e-05, "loss": 0.0764, "step": 7671 }, { "epoch": 0.5973042674322417, "grad_norm": 0.7408854058006792, "learning_rate": 1.4474511594823219e-05, "loss": 0.1042, "step": 7672 }, { "epoch": 0.5973821225244513, "grad_norm": 0.6597371432022378, "learning_rate": 1.4469714248788931e-05, "loss": 0.0548, "step": 7673 }, { "epoch": 0.597459977616661, "grad_norm": 0.8343991858125764, "learning_rate": 1.4464917247265452e-05, "loss": 0.1071, "step": 7674 }, { "epoch": 0.5975378327088706, "grad_norm": 0.7250979435013674, "learning_rate": 1.4460120590551613e-05, "loss": 0.0707, "step": 7675 }, { "epoch": 0.5976156878010802, "grad_norm": 0.7932037322264384, "learning_rate": 1.4455324278946221e-05, "loss": 0.091, "step": 7676 }, { "epoch": 0.5976935428932899, "grad_norm": 0.7184523001194479, "learning_rate": 1.4450528312748064e-05, "loss": 0.061, "step": 7677 }, { "epoch": 0.5977713979854995, "grad_norm": 0.7558446305843196, "learning_rate": 1.4445732692255905e-05, "loss": 0.0871, "step": 7678 }, { "epoch": 0.5978492530777091, "grad_norm": 0.7817362027917919, "learning_rate": 1.4440937417768495e-05, "loss": 0.0694, "step": 7679 }, { "epoch": 0.5979271081699188, "grad_norm": 0.726447764069363, "learning_rate": 1.4436142489584554e-05, "loss": 0.0887, "step": 7680 }, { "epoch": 0.5980049632621284, "grad_norm": 0.7388267267066393, "learning_rate": 1.4431347908002784e-05, "loss": 0.0804, "step": 7681 }, { "epoch": 0.598082818354338, "grad_norm": 0.7495197839251012, "learning_rate": 1.4426553673321863e-05, "loss": 0.0933, "step": 7682 }, { "epoch": 0.5981606734465477, "grad_norm": 0.7088659947816008, "learning_rate": 1.442175978584045e-05, "loss": 0.0747, "step": 7683 }, { "epoch": 0.5982385285387573, "grad_norm": 0.7074366426369015, "learning_rate": 1.4416966245857184e-05, "loss": 0.0819, "step": 7684 }, { "epoch": 0.5983163836309668, "grad_norm": 0.7919467453048837, "learning_rate": 1.4412173053670678e-05, "loss": 0.1001, "step": 7685 }, { "epoch": 0.5983942387231765, "grad_norm": 0.7411270299235078, "learning_rate": 1.4407380209579526e-05, "loss": 0.0806, "step": 7686 }, { "epoch": 0.5984720938153861, "grad_norm": 0.7305214079682255, "learning_rate": 1.44025877138823e-05, "loss": 0.0921, "step": 7687 }, { "epoch": 0.5985499489075957, "grad_norm": 0.7134900937444876, "learning_rate": 1.4397795566877545e-05, "loss": 0.0832, "step": 7688 }, { "epoch": 0.5986278039998053, "grad_norm": 0.6787523244701342, "learning_rate": 1.4393003768863798e-05, "loss": 0.0636, "step": 7689 }, { "epoch": 0.598705659092015, "grad_norm": 0.6529759157383802, "learning_rate": 1.438821232013956e-05, "loss": 0.0767, "step": 7690 }, { "epoch": 0.5987835141842246, "grad_norm": 0.7141912998266877, "learning_rate": 1.4383421221003318e-05, "loss": 0.0682, "step": 7691 }, { "epoch": 0.5988613692764342, "grad_norm": 0.7073521128149751, "learning_rate": 1.4378630471753531e-05, "loss": 0.0784, "step": 7692 }, { "epoch": 0.5989392243686439, "grad_norm": 0.6699338578092524, "learning_rate": 1.4373840072688641e-05, "loss": 0.0815, "step": 7693 }, { "epoch": 0.5990170794608535, "grad_norm": 0.7092734653699467, "learning_rate": 1.4369050024107072e-05, "loss": 0.0875, "step": 7694 }, { "epoch": 0.5990949345530631, "grad_norm": 0.6947321416285848, "learning_rate": 1.4364260326307218e-05, "loss": 0.0941, "step": 7695 }, { "epoch": 0.5991727896452728, "grad_norm": 0.7292219610034678, "learning_rate": 1.4359470979587457e-05, "loss": 0.0673, "step": 7696 }, { "epoch": 0.5992506447374824, "grad_norm": 0.8200985582074436, "learning_rate": 1.4354681984246142e-05, "loss": 0.1086, "step": 7697 }, { "epoch": 0.5993284998296919, "grad_norm": 0.7146788845268177, "learning_rate": 1.4349893340581598e-05, "loss": 0.0593, "step": 7698 }, { "epoch": 0.5994063549219016, "grad_norm": 0.779013902717448, "learning_rate": 1.4345105048892146e-05, "loss": 0.091, "step": 7699 }, { "epoch": 0.5994842100141112, "grad_norm": 0.7478871539740916, "learning_rate": 1.434031710947607e-05, "loss": 0.0654, "step": 7700 }, { "epoch": 0.5994842100141112, "eval_loss": 0.009844648651778698, "eval_runtime": 162.4189, "eval_samples_per_second": 17.732, "eval_steps_per_second": 0.634, "step": 7700 }, { "epoch": 0.5995620651063208, "grad_norm": 0.7442561343700592, "learning_rate": 1.4335529522631632e-05, "loss": 0.0682, "step": 7701 }, { "epoch": 0.5996399201985305, "grad_norm": 0.7533615807596451, "learning_rate": 1.4330742288657083e-05, "loss": 0.1062, "step": 7702 }, { "epoch": 0.5997177752907401, "grad_norm": 0.6285816329800665, "learning_rate": 1.4325955407850636e-05, "loss": 0.0745, "step": 7703 }, { "epoch": 0.5997956303829497, "grad_norm": 0.7520505506595673, "learning_rate": 1.4321168880510502e-05, "loss": 0.0611, "step": 7704 }, { "epoch": 0.5998734854751594, "grad_norm": 0.7208712450533109, "learning_rate": 1.4316382706934851e-05, "loss": 0.0789, "step": 7705 }, { "epoch": 0.599951340567369, "grad_norm": 0.8066406933592066, "learning_rate": 1.4311596887421844e-05, "loss": 0.0797, "step": 7706 }, { "epoch": 0.6000291956595786, "grad_norm": 0.7222297745906782, "learning_rate": 1.4306811422269612e-05, "loss": 0.064, "step": 7707 }, { "epoch": 0.6001070507517883, "grad_norm": 0.7062370702828187, "learning_rate": 1.4302026311776276e-05, "loss": 0.0748, "step": 7708 }, { "epoch": 0.6001849058439979, "grad_norm": 0.8008354898928923, "learning_rate": 1.42972415562399e-05, "loss": 0.0827, "step": 7709 }, { "epoch": 0.6002627609362075, "grad_norm": 0.6820351304784675, "learning_rate": 1.4292457155958581e-05, "loss": 0.0762, "step": 7710 }, { "epoch": 0.6003406160284172, "grad_norm": 0.6471335927122218, "learning_rate": 1.4287673111230354e-05, "loss": 0.0589, "step": 7711 }, { "epoch": 0.6004184711206267, "grad_norm": 0.6694410016296747, "learning_rate": 1.4282889422353238e-05, "loss": 0.075, "step": 7712 }, { "epoch": 0.6004963262128363, "grad_norm": 0.6541034484977852, "learning_rate": 1.4278106089625245e-05, "loss": 0.0732, "step": 7713 }, { "epoch": 0.600574181305046, "grad_norm": 0.6638802940942752, "learning_rate": 1.4273323113344337e-05, "loss": 0.063, "step": 7714 }, { "epoch": 0.6006520363972556, "grad_norm": 0.691443379009276, "learning_rate": 1.4268540493808487e-05, "loss": 0.061, "step": 7715 }, { "epoch": 0.6007298914894652, "grad_norm": 0.7570789100313882, "learning_rate": 1.4263758231315625e-05, "loss": 0.0933, "step": 7716 }, { "epoch": 0.6008077465816749, "grad_norm": 0.6545684323607956, "learning_rate": 1.4258976326163665e-05, "loss": 0.0715, "step": 7717 }, { "epoch": 0.6008856016738845, "grad_norm": 0.6118313714238122, "learning_rate": 1.4254194778650492e-05, "loss": 0.0465, "step": 7718 }, { "epoch": 0.6009634567660941, "grad_norm": 0.6298587517105256, "learning_rate": 1.4249413589073968e-05, "loss": 0.0846, "step": 7719 }, { "epoch": 0.6010413118583038, "grad_norm": 0.7495112295147627, "learning_rate": 1.4244632757731958e-05, "loss": 0.0895, "step": 7720 }, { "epoch": 0.6011191669505134, "grad_norm": 0.734039881948161, "learning_rate": 1.423985228492227e-05, "loss": 0.0757, "step": 7721 }, { "epoch": 0.601197022042723, "grad_norm": 0.7161443932808221, "learning_rate": 1.4235072170942716e-05, "loss": 0.0866, "step": 7722 }, { "epoch": 0.6012748771349327, "grad_norm": 0.682666740138009, "learning_rate": 1.4230292416091062e-05, "loss": 0.0743, "step": 7723 }, { "epoch": 0.6013527322271423, "grad_norm": 0.7839828582005932, "learning_rate": 1.4225513020665065e-05, "loss": 0.0908, "step": 7724 }, { "epoch": 0.6014305873193518, "grad_norm": 0.7005911112273469, "learning_rate": 1.4220733984962474e-05, "loss": 0.0854, "step": 7725 }, { "epoch": 0.6015084424115614, "grad_norm": 0.7427221188341596, "learning_rate": 1.4215955309280993e-05, "loss": 0.086, "step": 7726 }, { "epoch": 0.6015862975037711, "grad_norm": 0.7624505928413753, "learning_rate": 1.4211176993918303e-05, "loss": 0.0924, "step": 7727 }, { "epoch": 0.6016641525959807, "grad_norm": 0.7098529448106528, "learning_rate": 1.4206399039172078e-05, "loss": 0.0544, "step": 7728 }, { "epoch": 0.6017420076881903, "grad_norm": 0.5797290663271975, "learning_rate": 1.4201621445339951e-05, "loss": 0.0446, "step": 7729 }, { "epoch": 0.6018198627804, "grad_norm": 0.7631947785466244, "learning_rate": 1.4196844212719563e-05, "loss": 0.1082, "step": 7730 }, { "epoch": 0.6018977178726096, "grad_norm": 0.6187408559947107, "learning_rate": 1.4192067341608507e-05, "loss": 0.0545, "step": 7731 }, { "epoch": 0.6019755729648192, "grad_norm": 0.8150784856732, "learning_rate": 1.418729083230435e-05, "loss": 0.1152, "step": 7732 }, { "epoch": 0.6020534280570289, "grad_norm": 0.695036455288231, "learning_rate": 1.4182514685104656e-05, "loss": 0.0579, "step": 7733 }, { "epoch": 0.6021312831492385, "grad_norm": 0.8019173739140573, "learning_rate": 1.4177738900306941e-05, "loss": 0.0784, "step": 7734 }, { "epoch": 0.6022091382414481, "grad_norm": 0.7672095100991512, "learning_rate": 1.4172963478208739e-05, "loss": 0.0673, "step": 7735 }, { "epoch": 0.6022869933336578, "grad_norm": 0.7345163457126721, "learning_rate": 1.4168188419107518e-05, "loss": 0.0485, "step": 7736 }, { "epoch": 0.6023648484258673, "grad_norm": 0.6529394970476132, "learning_rate": 1.4163413723300747e-05, "loss": 0.074, "step": 7737 }, { "epoch": 0.6024427035180769, "grad_norm": 0.8208153837823525, "learning_rate": 1.4158639391085866e-05, "loss": 0.0919, "step": 7738 }, { "epoch": 0.6025205586102866, "grad_norm": 0.7604176132554028, "learning_rate": 1.415386542276029e-05, "loss": 0.0971, "step": 7739 }, { "epoch": 0.6025984137024962, "grad_norm": 0.6426876207907808, "learning_rate": 1.4149091818621426e-05, "loss": 0.0508, "step": 7740 }, { "epoch": 0.6026762687947058, "grad_norm": 0.6959619889857527, "learning_rate": 1.4144318578966635e-05, "loss": 0.0869, "step": 7741 }, { "epoch": 0.6027541238869155, "grad_norm": 0.7706902775424292, "learning_rate": 1.4139545704093278e-05, "loss": 0.0938, "step": 7742 }, { "epoch": 0.6028319789791251, "grad_norm": 0.6912460428402909, "learning_rate": 1.4134773194298674e-05, "loss": 0.0754, "step": 7743 }, { "epoch": 0.6029098340713347, "grad_norm": 0.76026882862196, "learning_rate": 1.4130001049880128e-05, "loss": 0.0871, "step": 7744 }, { "epoch": 0.6029876891635444, "grad_norm": 0.6076635515124643, "learning_rate": 1.4125229271134925e-05, "loss": 0.0482, "step": 7745 }, { "epoch": 0.603065544255754, "grad_norm": 0.6690635516037904, "learning_rate": 1.4120457858360328e-05, "loss": 0.0689, "step": 7746 }, { "epoch": 0.6031433993479636, "grad_norm": 0.6213094136403748, "learning_rate": 1.4115686811853571e-05, "loss": 0.0495, "step": 7747 }, { "epoch": 0.6032212544401733, "grad_norm": 0.7121574774437479, "learning_rate": 1.4110916131911867e-05, "loss": 0.0997, "step": 7748 }, { "epoch": 0.6032991095323829, "grad_norm": 0.7592886493097757, "learning_rate": 1.4106145818832407e-05, "loss": 0.1098, "step": 7749 }, { "epoch": 0.6033769646245924, "grad_norm": 0.63305809736848, "learning_rate": 1.410137587291235e-05, "loss": 0.0772, "step": 7750 }, { "epoch": 0.6033769646245924, "eval_loss": 0.009689838625490665, "eval_runtime": 162.2648, "eval_samples_per_second": 17.749, "eval_steps_per_second": 0.635, "step": 7750 }, { "epoch": 0.6034548197168021, "grad_norm": 0.7056636418015343, "learning_rate": 1.4096606294448857e-05, "loss": 0.0845, "step": 7751 }, { "epoch": 0.6035326748090117, "grad_norm": 0.6660033241912812, "learning_rate": 1.4091837083739047e-05, "loss": 0.0706, "step": 7752 }, { "epoch": 0.6036105299012213, "grad_norm": 0.7011958414210692, "learning_rate": 1.4087068241080014e-05, "loss": 0.0784, "step": 7753 }, { "epoch": 0.603688384993431, "grad_norm": 0.6751394711565547, "learning_rate": 1.4082299766768835e-05, "loss": 0.0923, "step": 7754 }, { "epoch": 0.6037662400856406, "grad_norm": 0.6548139511676841, "learning_rate": 1.407753166110256e-05, "loss": 0.0585, "step": 7755 }, { "epoch": 0.6038440951778502, "grad_norm": 0.8225233365447112, "learning_rate": 1.407276392437823e-05, "loss": 0.0795, "step": 7756 }, { "epoch": 0.6039219502700599, "grad_norm": 0.7175671372907952, "learning_rate": 1.4067996556892847e-05, "loss": 0.0739, "step": 7757 }, { "epoch": 0.6039998053622695, "grad_norm": 0.5757823341337487, "learning_rate": 1.4063229558943397e-05, "loss": 0.0453, "step": 7758 }, { "epoch": 0.6040776604544791, "grad_norm": 0.7848278307518223, "learning_rate": 1.4058462930826837e-05, "loss": 0.095, "step": 7759 }, { "epoch": 0.6041555155466887, "grad_norm": 0.7047281120510439, "learning_rate": 1.4053696672840106e-05, "loss": 0.0865, "step": 7760 }, { "epoch": 0.6042333706388984, "grad_norm": 0.684149696165312, "learning_rate": 1.4048930785280126e-05, "loss": 0.0763, "step": 7761 }, { "epoch": 0.604311225731108, "grad_norm": 0.7918672217304644, "learning_rate": 1.4044165268443788e-05, "loss": 0.102, "step": 7762 }, { "epoch": 0.6043890808233175, "grad_norm": 0.7250270663934136, "learning_rate": 1.4039400122627958e-05, "loss": 0.1088, "step": 7763 }, { "epoch": 0.6044669359155272, "grad_norm": 0.7367944475205506, "learning_rate": 1.403463534812948e-05, "loss": 0.0942, "step": 7764 }, { "epoch": 0.6045447910077368, "grad_norm": 0.6143044356262124, "learning_rate": 1.4029870945245176e-05, "loss": 0.0577, "step": 7765 }, { "epoch": 0.6046226460999464, "grad_norm": 0.6566241945949494, "learning_rate": 1.4025106914271858e-05, "loss": 0.0779, "step": 7766 }, { "epoch": 0.6047005011921561, "grad_norm": 0.791484075421282, "learning_rate": 1.4020343255506292e-05, "loss": 0.1396, "step": 7767 }, { "epoch": 0.6047783562843657, "grad_norm": 0.6264942107625322, "learning_rate": 1.4015579969245237e-05, "loss": 0.0633, "step": 7768 }, { "epoch": 0.6048562113765753, "grad_norm": 0.5833329705963063, "learning_rate": 1.401081705578542e-05, "loss": 0.0661, "step": 7769 }, { "epoch": 0.604934066468785, "grad_norm": 0.6800251542655287, "learning_rate": 1.4006054515423542e-05, "loss": 0.0712, "step": 7770 }, { "epoch": 0.6050119215609946, "grad_norm": 0.7524716231349994, "learning_rate": 1.4001292348456299e-05, "loss": 0.0879, "step": 7771 }, { "epoch": 0.6050897766532042, "grad_norm": 0.6480019199626196, "learning_rate": 1.3996530555180348e-05, "loss": 0.0675, "step": 7772 }, { "epoch": 0.6051676317454139, "grad_norm": 0.9753197752788312, "learning_rate": 1.3991769135892322e-05, "loss": 0.0815, "step": 7773 }, { "epoch": 0.6052454868376235, "grad_norm": 0.6333554267018041, "learning_rate": 1.3987008090888838e-05, "loss": 0.0712, "step": 7774 }, { "epoch": 0.6053233419298331, "grad_norm": 0.8135841450947837, "learning_rate": 1.3982247420466486e-05, "loss": 0.0828, "step": 7775 }, { "epoch": 0.6054011970220428, "grad_norm": 0.6575198980804027, "learning_rate": 1.3977487124921834e-05, "loss": 0.0862, "step": 7776 }, { "epoch": 0.6054790521142523, "grad_norm": 0.741649760931235, "learning_rate": 1.3972727204551426e-05, "loss": 0.0955, "step": 7777 }, { "epoch": 0.6055569072064619, "grad_norm": 0.6074844075552921, "learning_rate": 1.3967967659651784e-05, "loss": 0.0553, "step": 7778 }, { "epoch": 0.6056347622986716, "grad_norm": 0.6433104916434339, "learning_rate": 1.3963208490519406e-05, "loss": 0.0599, "step": 7779 }, { "epoch": 0.6057126173908812, "grad_norm": 0.7889218535940029, "learning_rate": 1.3958449697450753e-05, "loss": 0.0722, "step": 7780 }, { "epoch": 0.6057904724830908, "grad_norm": 0.6124224909145409, "learning_rate": 1.3953691280742294e-05, "loss": 0.0631, "step": 7781 }, { "epoch": 0.6058683275753005, "grad_norm": 0.6812899421137721, "learning_rate": 1.3948933240690448e-05, "loss": 0.0511, "step": 7782 }, { "epoch": 0.6059461826675101, "grad_norm": 0.6966202412250811, "learning_rate": 1.3944175577591619e-05, "loss": 0.0758, "step": 7783 }, { "epoch": 0.6060240377597197, "grad_norm": 0.7118271329874277, "learning_rate": 1.3939418291742191e-05, "loss": 0.0792, "step": 7784 }, { "epoch": 0.6061018928519294, "grad_norm": 0.5837353220485529, "learning_rate": 1.393466138343851e-05, "loss": 0.0496, "step": 7785 }, { "epoch": 0.606179747944139, "grad_norm": 0.6153006810882504, "learning_rate": 1.3929904852976907e-05, "loss": 0.0577, "step": 7786 }, { "epoch": 0.6062576030363486, "grad_norm": 0.8355983261857344, "learning_rate": 1.392514870065371e-05, "loss": 0.0886, "step": 7787 }, { "epoch": 0.6063354581285583, "grad_norm": 0.6961178688874938, "learning_rate": 1.3920392926765198e-05, "loss": 0.0691, "step": 7788 }, { "epoch": 0.6064133132207679, "grad_norm": 0.7003327255531621, "learning_rate": 1.3915637531607626e-05, "loss": 0.0818, "step": 7789 }, { "epoch": 0.6064911683129774, "grad_norm": 0.6669024943085644, "learning_rate": 1.3910882515477235e-05, "loss": 0.0766, "step": 7790 }, { "epoch": 0.6065690234051871, "grad_norm": 0.6776145158374901, "learning_rate": 1.3906127878670237e-05, "loss": 0.0875, "step": 7791 }, { "epoch": 0.6066468784973967, "grad_norm": 0.7009792014895825, "learning_rate": 1.3901373621482834e-05, "loss": 0.0697, "step": 7792 }, { "epoch": 0.6067247335896063, "grad_norm": 0.6212002049749938, "learning_rate": 1.3896619744211193e-05, "loss": 0.0598, "step": 7793 }, { "epoch": 0.606802588681816, "grad_norm": 0.6309738375258357, "learning_rate": 1.3891866247151451e-05, "loss": 0.0661, "step": 7794 }, { "epoch": 0.6068804437740256, "grad_norm": 0.6930811115412704, "learning_rate": 1.388711313059973e-05, "loss": 0.0767, "step": 7795 }, { "epoch": 0.6069582988662352, "grad_norm": 0.6453010627239362, "learning_rate": 1.3882360394852116e-05, "loss": 0.0762, "step": 7796 }, { "epoch": 0.6070361539584448, "grad_norm": 0.7509380419182456, "learning_rate": 1.3877608040204713e-05, "loss": 0.0732, "step": 7797 }, { "epoch": 0.6071140090506545, "grad_norm": 0.7082451509553541, "learning_rate": 1.3872856066953543e-05, "loss": 0.1013, "step": 7798 }, { "epoch": 0.6071918641428641, "grad_norm": 0.6495850149398565, "learning_rate": 1.3868104475394639e-05, "loss": 0.0645, "step": 7799 }, { "epoch": 0.6072697192350737, "grad_norm": 0.6500797453113683, "learning_rate": 1.3863353265824006e-05, "loss": 0.0731, "step": 7800 }, { "epoch": 0.6072697192350737, "eval_loss": 0.009523801505565643, "eval_runtime": 162.9121, "eval_samples_per_second": 17.678, "eval_steps_per_second": 0.632, "step": 7800 }, { "epoch": 0.6073475743272834, "grad_norm": 0.7568875992093808, "learning_rate": 1.3858602438537612e-05, "loss": 0.0683, "step": 7801 }, { "epoch": 0.607425429419493, "grad_norm": 0.7560138728735054, "learning_rate": 1.3853851993831428e-05, "loss": 0.1108, "step": 7802 }, { "epoch": 0.6075032845117025, "grad_norm": 0.6390787344939465, "learning_rate": 1.3849101932001374e-05, "loss": 0.0514, "step": 7803 }, { "epoch": 0.6075811396039122, "grad_norm": 0.7453048383156988, "learning_rate": 1.3844352253343361e-05, "loss": 0.1131, "step": 7804 }, { "epoch": 0.6076589946961218, "grad_norm": 0.6539936478208686, "learning_rate": 1.3839602958153263e-05, "loss": 0.0575, "step": 7805 }, { "epoch": 0.6077368497883314, "grad_norm": 0.6425691816642132, "learning_rate": 1.3834854046726941e-05, "loss": 0.0546, "step": 7806 }, { "epoch": 0.6078147048805411, "grad_norm": 0.7776090809352106, "learning_rate": 1.3830105519360237e-05, "loss": 0.1177, "step": 7807 }, { "epoch": 0.6078925599727507, "grad_norm": 0.6536394694077997, "learning_rate": 1.3825357376348962e-05, "loss": 0.081, "step": 7808 }, { "epoch": 0.6079704150649603, "grad_norm": 0.7474975033767273, "learning_rate": 1.3820609617988896e-05, "loss": 0.0877, "step": 7809 }, { "epoch": 0.60804827015717, "grad_norm": 0.7744800200630096, "learning_rate": 1.3815862244575805e-05, "loss": 0.0905, "step": 7810 }, { "epoch": 0.6081261252493796, "grad_norm": 0.7181427325694989, "learning_rate": 1.3811115256405425e-05, "loss": 0.0796, "step": 7811 }, { "epoch": 0.6082039803415892, "grad_norm": 0.6830794937962327, "learning_rate": 1.3806368653773476e-05, "loss": 0.0759, "step": 7812 }, { "epoch": 0.6082818354337989, "grad_norm": 0.718070661482459, "learning_rate": 1.3801622436975653e-05, "loss": 0.0799, "step": 7813 }, { "epoch": 0.6083596905260085, "grad_norm": 0.6210505580750363, "learning_rate": 1.379687660630761e-05, "loss": 0.0587, "step": 7814 }, { "epoch": 0.6084375456182181, "grad_norm": 0.6728366553534958, "learning_rate": 1.3792131162065002e-05, "loss": 0.0577, "step": 7815 }, { "epoch": 0.6085154007104278, "grad_norm": 0.6689759975617238, "learning_rate": 1.3787386104543438e-05, "loss": 0.0569, "step": 7816 }, { "epoch": 0.6085932558026373, "grad_norm": 0.7100917181191542, "learning_rate": 1.3782641434038522e-05, "loss": 0.0856, "step": 7817 }, { "epoch": 0.6086711108948469, "grad_norm": 0.7145001178012862, "learning_rate": 1.3777897150845818e-05, "loss": 0.0678, "step": 7818 }, { "epoch": 0.6087489659870566, "grad_norm": 0.7078756055311385, "learning_rate": 1.3773153255260879e-05, "loss": 0.0823, "step": 7819 }, { "epoch": 0.6088268210792662, "grad_norm": 0.6228677839278809, "learning_rate": 1.376840974757922e-05, "loss": 0.0633, "step": 7820 }, { "epoch": 0.6089046761714758, "grad_norm": 0.6828379452222443, "learning_rate": 1.3763666628096344e-05, "loss": 0.0673, "step": 7821 }, { "epoch": 0.6089825312636855, "grad_norm": 0.6498810701095302, "learning_rate": 1.3758923897107718e-05, "loss": 0.0566, "step": 7822 }, { "epoch": 0.6090603863558951, "grad_norm": 0.6649998770932473, "learning_rate": 1.3754181554908805e-05, "loss": 0.0732, "step": 7823 }, { "epoch": 0.6091382414481047, "grad_norm": 0.6927971208210463, "learning_rate": 1.3749439601795021e-05, "loss": 0.0767, "step": 7824 }, { "epoch": 0.6092160965403144, "grad_norm": 0.7095893079388786, "learning_rate": 1.374469803806177e-05, "loss": 0.0815, "step": 7825 }, { "epoch": 0.609293951632524, "grad_norm": 0.7216725109066942, "learning_rate": 1.3739956864004428e-05, "loss": 0.0795, "step": 7826 }, { "epoch": 0.6093718067247336, "grad_norm": 0.7470208433658537, "learning_rate": 1.3735216079918345e-05, "loss": 0.087, "step": 7827 }, { "epoch": 0.6094496618169433, "grad_norm": 0.6199557177670474, "learning_rate": 1.3730475686098859e-05, "loss": 0.0561, "step": 7828 }, { "epoch": 0.6095275169091529, "grad_norm": 0.6929431798576317, "learning_rate": 1.3725735682841266e-05, "loss": 0.0649, "step": 7829 }, { "epoch": 0.6096053720013624, "grad_norm": 0.6794932596151056, "learning_rate": 1.372099607044085e-05, "loss": 0.078, "step": 7830 }, { "epoch": 0.6096832270935721, "grad_norm": 0.7045441596959114, "learning_rate": 1.3716256849192863e-05, "loss": 0.0781, "step": 7831 }, { "epoch": 0.6097610821857817, "grad_norm": 0.771349727326555, "learning_rate": 1.3711518019392536e-05, "loss": 0.0663, "step": 7832 }, { "epoch": 0.6098389372779913, "grad_norm": 0.6765048688796433, "learning_rate": 1.370677958133508e-05, "loss": 0.0663, "step": 7833 }, { "epoch": 0.6099167923702009, "grad_norm": 0.6086028255498546, "learning_rate": 1.3702041535315677e-05, "loss": 0.0605, "step": 7834 }, { "epoch": 0.6099946474624106, "grad_norm": 0.6773484472566899, "learning_rate": 1.3697303881629483e-05, "loss": 0.0627, "step": 7835 }, { "epoch": 0.6100725025546202, "grad_norm": 0.6402431772724754, "learning_rate": 1.369256662057163e-05, "loss": 0.0632, "step": 7836 }, { "epoch": 0.6101503576468298, "grad_norm": 0.6979837197304196, "learning_rate": 1.3687829752437227e-05, "loss": 0.0886, "step": 7837 }, { "epoch": 0.6102282127390395, "grad_norm": 0.6798294744682415, "learning_rate": 1.3683093277521367e-05, "loss": 0.0734, "step": 7838 }, { "epoch": 0.6103060678312491, "grad_norm": 0.6623450715280441, "learning_rate": 1.3678357196119098e-05, "loss": 0.0753, "step": 7839 }, { "epoch": 0.6103839229234587, "grad_norm": 0.6948367062163905, "learning_rate": 1.3673621508525466e-05, "loss": 0.081, "step": 7840 }, { "epoch": 0.6104617780156684, "grad_norm": 0.6574491618201801, "learning_rate": 1.3668886215035479e-05, "loss": 0.0668, "step": 7841 }, { "epoch": 0.610539633107878, "grad_norm": 0.7174872752736077, "learning_rate": 1.3664151315944109e-05, "loss": 0.0704, "step": 7842 }, { "epoch": 0.6106174882000875, "grad_norm": 0.6801865308544908, "learning_rate": 1.3659416811546341e-05, "loss": 0.0925, "step": 7843 }, { "epoch": 0.6106953432922972, "grad_norm": 0.737577695650602, "learning_rate": 1.3654682702137103e-05, "loss": 0.068, "step": 7844 }, { "epoch": 0.6107731983845068, "grad_norm": 0.6674681125313934, "learning_rate": 1.3649948988011306e-05, "loss": 0.0627, "step": 7845 }, { "epoch": 0.6108510534767164, "grad_norm": 0.6136230025659898, "learning_rate": 1.3645215669463842e-05, "loss": 0.0602, "step": 7846 }, { "epoch": 0.6109289085689261, "grad_norm": 0.6927914221827955, "learning_rate": 1.3640482746789561e-05, "loss": 0.0835, "step": 7847 }, { "epoch": 0.6110067636611357, "grad_norm": 0.689959738816252, "learning_rate": 1.3635750220283321e-05, "loss": 0.0783, "step": 7848 }, { "epoch": 0.6110846187533453, "grad_norm": 0.6590951638627052, "learning_rate": 1.3631018090239927e-05, "loss": 0.069, "step": 7849 }, { "epoch": 0.611162473845555, "grad_norm": 0.5903787851630233, "learning_rate": 1.3626286356954169e-05, "loss": 0.0488, "step": 7850 }, { "epoch": 0.611162473845555, "eval_loss": 0.00944290217012167, "eval_runtime": 162.838, "eval_samples_per_second": 17.686, "eval_steps_per_second": 0.633, "step": 7850 }, { "epoch": 0.6112403289377646, "grad_norm": 0.597464823007736, "learning_rate": 1.3621555020720813e-05, "loss": 0.059, "step": 7851 }, { "epoch": 0.6113181840299742, "grad_norm": 0.6425139004604532, "learning_rate": 1.361682408183459e-05, "loss": 0.0545, "step": 7852 }, { "epoch": 0.6113960391221839, "grad_norm": 0.6804322362670917, "learning_rate": 1.361209354059023e-05, "loss": 0.0744, "step": 7853 }, { "epoch": 0.6114738942143935, "grad_norm": 0.6929654249675761, "learning_rate": 1.3607363397282416e-05, "loss": 0.0643, "step": 7854 }, { "epoch": 0.611551749306603, "grad_norm": 0.6681742080470977, "learning_rate": 1.360263365220582e-05, "loss": 0.0699, "step": 7855 }, { "epoch": 0.6116296043988128, "grad_norm": 0.6533870778272465, "learning_rate": 1.3597904305655071e-05, "loss": 0.0727, "step": 7856 }, { "epoch": 0.6117074594910223, "grad_norm": 0.6626338137246524, "learning_rate": 1.3593175357924784e-05, "loss": 0.0657, "step": 7857 }, { "epoch": 0.6117853145832319, "grad_norm": 0.6418784112815655, "learning_rate": 1.3588446809309565e-05, "loss": 0.0607, "step": 7858 }, { "epoch": 0.6118631696754416, "grad_norm": 0.7244746218466631, "learning_rate": 1.3583718660103973e-05, "loss": 0.0997, "step": 7859 }, { "epoch": 0.6119410247676512, "grad_norm": 0.6146723875571726, "learning_rate": 1.3578990910602556e-05, "loss": 0.0527, "step": 7860 }, { "epoch": 0.6120188798598608, "grad_norm": 0.7113591766230931, "learning_rate": 1.3574263561099817e-05, "loss": 0.0685, "step": 7861 }, { "epoch": 0.6120967349520705, "grad_norm": 0.7358233194842878, "learning_rate": 1.3569536611890255e-05, "loss": 0.0774, "step": 7862 }, { "epoch": 0.6121745900442801, "grad_norm": 0.7647404488297979, "learning_rate": 1.3564810063268328e-05, "loss": 0.1149, "step": 7863 }, { "epoch": 0.6122524451364897, "grad_norm": 0.7262491803480743, "learning_rate": 1.35600839155285e-05, "loss": 0.0971, "step": 7864 }, { "epoch": 0.6123303002286994, "grad_norm": 0.6012657589538776, "learning_rate": 1.3555358168965164e-05, "loss": 0.0641, "step": 7865 }, { "epoch": 0.612408155320909, "grad_norm": 0.7500198975934225, "learning_rate": 1.3550632823872726e-05, "loss": 0.0837, "step": 7866 }, { "epoch": 0.6124860104131186, "grad_norm": 0.7123652283374813, "learning_rate": 1.354590788054555e-05, "loss": 0.0547, "step": 7867 }, { "epoch": 0.6125638655053282, "grad_norm": 0.6673462146859768, "learning_rate": 1.3541183339277966e-05, "loss": 0.074, "step": 7868 }, { "epoch": 0.6126417205975379, "grad_norm": 0.630709545535068, "learning_rate": 1.3536459200364313e-05, "loss": 0.0579, "step": 7869 }, { "epoch": 0.6127195756897474, "grad_norm": 0.7552409863253609, "learning_rate": 1.3531735464098868e-05, "loss": 0.1067, "step": 7870 }, { "epoch": 0.612797430781957, "grad_norm": 0.6317887861156514, "learning_rate": 1.3527012130775896e-05, "loss": 0.0755, "step": 7871 }, { "epoch": 0.6128752858741667, "grad_norm": 0.727671670509188, "learning_rate": 1.3522289200689647e-05, "loss": 0.0774, "step": 7872 }, { "epoch": 0.6129531409663763, "grad_norm": 0.6721837728042371, "learning_rate": 1.3517566674134327e-05, "loss": 0.0695, "step": 7873 }, { "epoch": 0.6130309960585859, "grad_norm": 0.7005104898801197, "learning_rate": 1.3512844551404136e-05, "loss": 0.0664, "step": 7874 }, { "epoch": 0.6131088511507956, "grad_norm": 0.6952423130107761, "learning_rate": 1.3508122832793238e-05, "loss": 0.0602, "step": 7875 }, { "epoch": 0.6131867062430052, "grad_norm": 0.6663328755580973, "learning_rate": 1.3503401518595774e-05, "loss": 0.0495, "step": 7876 }, { "epoch": 0.6132645613352148, "grad_norm": 0.6890214370683662, "learning_rate": 1.3498680609105858e-05, "loss": 0.0892, "step": 7877 }, { "epoch": 0.6133424164274245, "grad_norm": 0.7901785255715974, "learning_rate": 1.3493960104617577e-05, "loss": 0.1117, "step": 7878 }, { "epoch": 0.6134202715196341, "grad_norm": 0.6604147752960501, "learning_rate": 1.3489240005425004e-05, "loss": 0.0571, "step": 7879 }, { "epoch": 0.6134981266118437, "grad_norm": 0.6153336878070021, "learning_rate": 1.3484520311822175e-05, "loss": 0.0682, "step": 7880 }, { "epoch": 0.6135759817040534, "grad_norm": 0.6062294704134114, "learning_rate": 1.3479801024103107e-05, "loss": 0.0634, "step": 7881 }, { "epoch": 0.613653836796263, "grad_norm": 0.72857019110613, "learning_rate": 1.3475082142561786e-05, "loss": 0.0942, "step": 7882 }, { "epoch": 0.6137316918884725, "grad_norm": 0.6133589771338734, "learning_rate": 1.3470363667492174e-05, "loss": 0.0646, "step": 7883 }, { "epoch": 0.6138095469806822, "grad_norm": 0.7881047886200856, "learning_rate": 1.3465645599188217e-05, "loss": 0.1215, "step": 7884 }, { "epoch": 0.6138874020728918, "grad_norm": 0.5432070496316088, "learning_rate": 1.3460927937943827e-05, "loss": 0.0433, "step": 7885 }, { "epoch": 0.6139652571651014, "grad_norm": 0.6727459893598547, "learning_rate": 1.3456210684052889e-05, "loss": 0.0669, "step": 7886 }, { "epoch": 0.6140431122573111, "grad_norm": 0.69769953435176, "learning_rate": 1.3451493837809264e-05, "loss": 0.0811, "step": 7887 }, { "epoch": 0.6141209673495207, "grad_norm": 0.6282385974770426, "learning_rate": 1.3446777399506791e-05, "loss": 0.0838, "step": 7888 }, { "epoch": 0.6141988224417303, "grad_norm": 0.6353725454579701, "learning_rate": 1.3442061369439288e-05, "loss": 0.0754, "step": 7889 }, { "epoch": 0.61427667753394, "grad_norm": 0.5885536160186965, "learning_rate": 1.3437345747900535e-05, "loss": 0.0632, "step": 7890 }, { "epoch": 0.6143545326261496, "grad_norm": 0.7404541744677137, "learning_rate": 1.3432630535184296e-05, "loss": 0.0834, "step": 7891 }, { "epoch": 0.6144323877183592, "grad_norm": 0.6896370718813526, "learning_rate": 1.3427915731584305e-05, "loss": 0.0876, "step": 7892 }, { "epoch": 0.6145102428105689, "grad_norm": 0.6342611822278449, "learning_rate": 1.3423201337394266e-05, "loss": 0.0619, "step": 7893 }, { "epoch": 0.6145880979027785, "grad_norm": 0.6737441692867094, "learning_rate": 1.3418487352907876e-05, "loss": 0.0961, "step": 7894 }, { "epoch": 0.614665952994988, "grad_norm": 0.6568784502957769, "learning_rate": 1.3413773778418789e-05, "loss": 0.0736, "step": 7895 }, { "epoch": 0.6147438080871978, "grad_norm": 0.6403386417817278, "learning_rate": 1.3409060614220637e-05, "loss": 0.0746, "step": 7896 }, { "epoch": 0.6148216631794073, "grad_norm": 0.6736711945125514, "learning_rate": 1.340434786060703e-05, "loss": 0.0656, "step": 7897 }, { "epoch": 0.6148995182716169, "grad_norm": 0.655225439510345, "learning_rate": 1.3399635517871545e-05, "loss": 0.0468, "step": 7898 }, { "epoch": 0.6149773733638266, "grad_norm": 0.6911481057581997, "learning_rate": 1.3394923586307742e-05, "loss": 0.0655, "step": 7899 }, { "epoch": 0.6150552284560362, "grad_norm": 0.6802316764309255, "learning_rate": 1.3390212066209156e-05, "loss": 0.0854, "step": 7900 }, { "epoch": 0.6150552284560362, "eval_loss": 0.009392064064741135, "eval_runtime": 162.7643, "eval_samples_per_second": 17.694, "eval_steps_per_second": 0.633, "step": 7900 }, { "epoch": 0.6151330835482458, "grad_norm": 0.640481270738281, "learning_rate": 1.3385500957869288e-05, "loss": 0.0649, "step": 7901 }, { "epoch": 0.6152109386404555, "grad_norm": 0.5450999895021523, "learning_rate": 1.3380790261581623e-05, "loss": 0.0424, "step": 7902 }, { "epoch": 0.6152887937326651, "grad_norm": 0.6040535042430879, "learning_rate": 1.3376079977639608e-05, "loss": 0.0497, "step": 7903 }, { "epoch": 0.6153666488248747, "grad_norm": 0.5622779049055894, "learning_rate": 1.3371370106336674e-05, "loss": 0.0337, "step": 7904 }, { "epoch": 0.6154445039170843, "grad_norm": 0.6617353855999256, "learning_rate": 1.3366660647966228e-05, "loss": 0.0887, "step": 7905 }, { "epoch": 0.615522359009294, "grad_norm": 0.589882991709393, "learning_rate": 1.3361951602821644e-05, "loss": 0.0673, "step": 7906 }, { "epoch": 0.6156002141015036, "grad_norm": 0.797166514369602, "learning_rate": 1.3357242971196274e-05, "loss": 0.1016, "step": 7907 }, { "epoch": 0.6156780691937132, "grad_norm": 0.681784910151223, "learning_rate": 1.3352534753383446e-05, "loss": 0.0664, "step": 7908 }, { "epoch": 0.6157559242859229, "grad_norm": 0.6854957995697925, "learning_rate": 1.3347826949676446e-05, "loss": 0.0903, "step": 7909 }, { "epoch": 0.6158337793781324, "grad_norm": 0.6563015173277883, "learning_rate": 1.3343119560368568e-05, "loss": 0.0619, "step": 7910 }, { "epoch": 0.615911634470342, "grad_norm": 0.6798709169889522, "learning_rate": 1.3338412585753048e-05, "loss": 0.0725, "step": 7911 }, { "epoch": 0.6159894895625517, "grad_norm": 0.5999487427101435, "learning_rate": 1.3333706026123117e-05, "loss": 0.0616, "step": 7912 }, { "epoch": 0.6160673446547613, "grad_norm": 0.6146926799025312, "learning_rate": 1.3328999881771965e-05, "loss": 0.0808, "step": 7913 }, { "epoch": 0.6161451997469709, "grad_norm": 0.7851007090079447, "learning_rate": 1.3324294152992754e-05, "loss": 0.1123, "step": 7914 }, { "epoch": 0.6162230548391806, "grad_norm": 0.5754195928597908, "learning_rate": 1.3319588840078645e-05, "loss": 0.0585, "step": 7915 }, { "epoch": 0.6163009099313902, "grad_norm": 0.6663961432614284, "learning_rate": 1.3314883943322754e-05, "loss": 0.0638, "step": 7916 }, { "epoch": 0.6163787650235998, "grad_norm": 0.7072335617441114, "learning_rate": 1.3310179463018171e-05, "loss": 0.0594, "step": 7917 }, { "epoch": 0.6164566201158095, "grad_norm": 0.6896545223135799, "learning_rate": 1.3305475399457963e-05, "loss": 0.0664, "step": 7918 }, { "epoch": 0.6165344752080191, "grad_norm": 0.6274493290354599, "learning_rate": 1.330077175293516e-05, "loss": 0.0567, "step": 7919 }, { "epoch": 0.6166123303002287, "grad_norm": 0.6886256693202231, "learning_rate": 1.3296068523742793e-05, "loss": 0.0894, "step": 7920 }, { "epoch": 0.6166901853924384, "grad_norm": 0.6890081066078091, "learning_rate": 1.3291365712173849e-05, "loss": 0.0761, "step": 7921 }, { "epoch": 0.616768040484648, "grad_norm": 0.6182790071578642, "learning_rate": 1.3286663318521292e-05, "loss": 0.0557, "step": 7922 }, { "epoch": 0.6168458955768575, "grad_norm": 0.6412704406362081, "learning_rate": 1.328196134307805e-05, "loss": 0.0757, "step": 7923 }, { "epoch": 0.6169237506690672, "grad_norm": 0.6791193385740112, "learning_rate": 1.3277259786137032e-05, "loss": 0.083, "step": 7924 }, { "epoch": 0.6170016057612768, "grad_norm": 0.7255999338669032, "learning_rate": 1.3272558647991134e-05, "loss": 0.092, "step": 7925 }, { "epoch": 0.6170794608534864, "grad_norm": 0.6842804936309477, "learning_rate": 1.3267857928933216e-05, "loss": 0.0605, "step": 7926 }, { "epoch": 0.6171573159456961, "grad_norm": 0.5698741669114518, "learning_rate": 1.3263157629256102e-05, "loss": 0.0666, "step": 7927 }, { "epoch": 0.6172351710379057, "grad_norm": 0.6321965251228739, "learning_rate": 1.3258457749252603e-05, "loss": 0.0788, "step": 7928 }, { "epoch": 0.6173130261301153, "grad_norm": 0.6907427673907515, "learning_rate": 1.3253758289215487e-05, "loss": 0.0631, "step": 7929 }, { "epoch": 0.617390881222325, "grad_norm": 0.6833405493129525, "learning_rate": 1.324905924943753e-05, "loss": 0.0693, "step": 7930 }, { "epoch": 0.6174687363145346, "grad_norm": 0.548549028142018, "learning_rate": 1.3244360630211453e-05, "loss": 0.0573, "step": 7931 }, { "epoch": 0.6175465914067442, "grad_norm": 0.6464220428228085, "learning_rate": 1.3239662431829949e-05, "loss": 0.0587, "step": 7932 }, { "epoch": 0.6176244464989539, "grad_norm": 0.7317601370457257, "learning_rate": 1.32349646545857e-05, "loss": 0.0643, "step": 7933 }, { "epoch": 0.6177023015911635, "grad_norm": 0.8006413509441633, "learning_rate": 1.3230267298771347e-05, "loss": 0.1162, "step": 7934 }, { "epoch": 0.617780156683373, "grad_norm": 0.6963042475417268, "learning_rate": 1.3225570364679534e-05, "loss": 0.0695, "step": 7935 }, { "epoch": 0.6178580117755827, "grad_norm": 0.6895989163969181, "learning_rate": 1.3220873852602838e-05, "loss": 0.0743, "step": 7936 }, { "epoch": 0.6179358668677923, "grad_norm": 0.6500465485032791, "learning_rate": 1.3216177762833838e-05, "loss": 0.0511, "step": 7937 }, { "epoch": 0.6180137219600019, "grad_norm": 0.6700968071879825, "learning_rate": 1.3211482095665076e-05, "loss": 0.0724, "step": 7938 }, { "epoch": 0.6180915770522115, "grad_norm": 0.6597144701759166, "learning_rate": 1.320678685138907e-05, "loss": 0.0745, "step": 7939 }, { "epoch": 0.6181694321444212, "grad_norm": 0.6467313481427664, "learning_rate": 1.3202092030298311e-05, "loss": 0.0533, "step": 7940 }, { "epoch": 0.6182472872366308, "grad_norm": 0.723711712200033, "learning_rate": 1.3197397632685265e-05, "loss": 0.0696, "step": 7941 }, { "epoch": 0.6183251423288404, "grad_norm": 0.5823594679602169, "learning_rate": 1.3192703658842374e-05, "loss": 0.0629, "step": 7942 }, { "epoch": 0.6184029974210501, "grad_norm": 0.7025904504669969, "learning_rate": 1.3188010109062047e-05, "loss": 0.0924, "step": 7943 }, { "epoch": 0.6184808525132597, "grad_norm": 0.6598846093447353, "learning_rate": 1.3183316983636671e-05, "loss": 0.0837, "step": 7944 }, { "epoch": 0.6185587076054693, "grad_norm": 0.6391550972233049, "learning_rate": 1.3178624282858603e-05, "loss": 0.053, "step": 7945 }, { "epoch": 0.618636562697679, "grad_norm": 0.6351486587614673, "learning_rate": 1.3173932007020182e-05, "loss": 0.0637, "step": 7946 }, { "epoch": 0.6187144177898886, "grad_norm": 0.7194732100422394, "learning_rate": 1.3169240156413713e-05, "loss": 0.101, "step": 7947 }, { "epoch": 0.6187922728820981, "grad_norm": 0.8203989018578303, "learning_rate": 1.3164548731331474e-05, "loss": 0.1105, "step": 7948 }, { "epoch": 0.6188701279743078, "grad_norm": 0.6342537288199482, "learning_rate": 1.3159857732065717e-05, "loss": 0.092, "step": 7949 }, { "epoch": 0.6189479830665174, "grad_norm": 0.6848465401587197, "learning_rate": 1.3155167158908672e-05, "loss": 0.0683, "step": 7950 }, { "epoch": 0.6189479830665174, "eval_loss": 0.00926894973963499, "eval_runtime": 162.5963, "eval_samples_per_second": 17.713, "eval_steps_per_second": 0.633, "step": 7950 }, { "epoch": 0.619025838158727, "grad_norm": 0.7119752642481778, "learning_rate": 1.3150477012152542e-05, "loss": 0.0752, "step": 7951 }, { "epoch": 0.6191036932509367, "grad_norm": 0.6913665965707324, "learning_rate": 1.3145787292089498e-05, "loss": 0.0816, "step": 7952 }, { "epoch": 0.6191815483431463, "grad_norm": 0.738418542721698, "learning_rate": 1.3141097999011687e-05, "loss": 0.0902, "step": 7953 }, { "epoch": 0.6192594034353559, "grad_norm": 0.6619182225638277, "learning_rate": 1.3136409133211232e-05, "loss": 0.0589, "step": 7954 }, { "epoch": 0.6193372585275656, "grad_norm": 0.6058906518791716, "learning_rate": 1.3131720694980223e-05, "loss": 0.0676, "step": 7955 }, { "epoch": 0.6194151136197752, "grad_norm": 0.5691128534491747, "learning_rate": 1.3127032684610735e-05, "loss": 0.0376, "step": 7956 }, { "epoch": 0.6194929687119848, "grad_norm": 0.661542368193641, "learning_rate": 1.3122345102394802e-05, "loss": 0.0602, "step": 7957 }, { "epoch": 0.6195708238041945, "grad_norm": 0.7272595371661809, "learning_rate": 1.3117657948624446e-05, "loss": 0.1112, "step": 7958 }, { "epoch": 0.6196486788964041, "grad_norm": 0.6783293497626145, "learning_rate": 1.3112971223591648e-05, "loss": 0.0618, "step": 7959 }, { "epoch": 0.6197265339886137, "grad_norm": 0.6322797054459441, "learning_rate": 1.3108284927588365e-05, "loss": 0.06, "step": 7960 }, { "epoch": 0.6198043890808234, "grad_norm": 0.6379565923736998, "learning_rate": 1.3103599060906543e-05, "loss": 0.0567, "step": 7961 }, { "epoch": 0.619882244173033, "grad_norm": 0.5685584226880418, "learning_rate": 1.3098913623838083e-05, "loss": 0.0551, "step": 7962 }, { "epoch": 0.6199600992652425, "grad_norm": 0.674489356273144, "learning_rate": 1.3094228616674867e-05, "loss": 0.0755, "step": 7963 }, { "epoch": 0.6200379543574522, "grad_norm": 0.7758640104544388, "learning_rate": 1.3089544039708746e-05, "loss": 0.086, "step": 7964 }, { "epoch": 0.6201158094496618, "grad_norm": 0.6477771456681731, "learning_rate": 1.3084859893231547e-05, "loss": 0.0618, "step": 7965 }, { "epoch": 0.6201936645418714, "grad_norm": 0.7022143954549522, "learning_rate": 1.3080176177535075e-05, "loss": 0.1013, "step": 7966 }, { "epoch": 0.6202715196340811, "grad_norm": 0.5558344260523164, "learning_rate": 1.3075492892911106e-05, "loss": 0.0431, "step": 7967 }, { "epoch": 0.6203493747262907, "grad_norm": 0.5877283857906183, "learning_rate": 1.3070810039651378e-05, "loss": 0.0601, "step": 7968 }, { "epoch": 0.6204272298185003, "grad_norm": 0.6106443651268667, "learning_rate": 1.3066127618047614e-05, "loss": 0.0826, "step": 7969 }, { "epoch": 0.62050508491071, "grad_norm": 0.6060143756918844, "learning_rate": 1.3061445628391502e-05, "loss": 0.0679, "step": 7970 }, { "epoch": 0.6205829400029196, "grad_norm": 0.5429279660965101, "learning_rate": 1.3056764070974718e-05, "loss": 0.0369, "step": 7971 }, { "epoch": 0.6206607950951292, "grad_norm": 0.6920736026212408, "learning_rate": 1.30520829460889e-05, "loss": 0.0794, "step": 7972 }, { "epoch": 0.6207386501873389, "grad_norm": 0.6037611335695884, "learning_rate": 1.3047402254025654e-05, "loss": 0.0605, "step": 7973 }, { "epoch": 0.6208165052795485, "grad_norm": 0.7347456688165509, "learning_rate": 1.3042721995076568e-05, "loss": 0.0848, "step": 7974 }, { "epoch": 0.620894360371758, "grad_norm": 0.6188406818507732, "learning_rate": 1.3038042169533203e-05, "loss": 0.0463, "step": 7975 }, { "epoch": 0.6209722154639676, "grad_norm": 0.6607967528900336, "learning_rate": 1.3033362777687075e-05, "loss": 0.0636, "step": 7976 }, { "epoch": 0.6210500705561773, "grad_norm": 0.5917164830880859, "learning_rate": 1.3028683819829708e-05, "loss": 0.0569, "step": 7977 }, { "epoch": 0.6211279256483869, "grad_norm": 0.6825914869443629, "learning_rate": 1.3024005296252572e-05, "loss": 0.0937, "step": 7978 }, { "epoch": 0.6212057807405965, "grad_norm": 0.629605694209393, "learning_rate": 1.3019327207247112e-05, "loss": 0.0703, "step": 7979 }, { "epoch": 0.6212836358328062, "grad_norm": 0.5789645031900147, "learning_rate": 1.301464955310476e-05, "loss": 0.0619, "step": 7980 }, { "epoch": 0.6213614909250158, "grad_norm": 0.5820390493304001, "learning_rate": 1.3009972334116899e-05, "loss": 0.0488, "step": 7981 }, { "epoch": 0.6214393460172254, "grad_norm": 0.6157238872734678, "learning_rate": 1.3005295550574907e-05, "loss": 0.0506, "step": 7982 }, { "epoch": 0.6215172011094351, "grad_norm": 0.6694381513455483, "learning_rate": 1.300061920277013e-05, "loss": 0.0671, "step": 7983 }, { "epoch": 0.6215950562016447, "grad_norm": 0.630722590927631, "learning_rate": 1.299594329099388e-05, "loss": 0.0711, "step": 7984 }, { "epoch": 0.6216729112938543, "grad_norm": 0.7345647296860597, "learning_rate": 1.2991267815537433e-05, "loss": 0.1, "step": 7985 }, { "epoch": 0.621750766386064, "grad_norm": 0.5970107912813825, "learning_rate": 1.2986592776692053e-05, "loss": 0.0557, "step": 7986 }, { "epoch": 0.6218286214782736, "grad_norm": 0.6438282232121181, "learning_rate": 1.2981918174748985e-05, "loss": 0.0817, "step": 7987 }, { "epoch": 0.6219064765704831, "grad_norm": 0.586822033270228, "learning_rate": 1.2977244009999426e-05, "loss": 0.0674, "step": 7988 }, { "epoch": 0.6219843316626928, "grad_norm": 0.6198365907146469, "learning_rate": 1.2972570282734558e-05, "loss": 0.0564, "step": 7989 }, { "epoch": 0.6220621867549024, "grad_norm": 0.6518191040219345, "learning_rate": 1.2967896993245526e-05, "loss": 0.0816, "step": 7990 }, { "epoch": 0.622140041847112, "grad_norm": 0.5665764922799417, "learning_rate": 1.296322414182345e-05, "loss": 0.0635, "step": 7991 }, { "epoch": 0.6222178969393217, "grad_norm": 0.6706982516010074, "learning_rate": 1.2958551728759444e-05, "loss": 0.0722, "step": 7992 }, { "epoch": 0.6222957520315313, "grad_norm": 0.6061888257855473, "learning_rate": 1.2953879754344569e-05, "loss": 0.056, "step": 7993 }, { "epoch": 0.6223736071237409, "grad_norm": 0.5863602062284925, "learning_rate": 1.2949208218869864e-05, "loss": 0.0723, "step": 7994 }, { "epoch": 0.6224514622159506, "grad_norm": 0.6603107183901523, "learning_rate": 1.2944537122626341e-05, "loss": 0.0851, "step": 7995 }, { "epoch": 0.6225293173081602, "grad_norm": 0.5994152540187029, "learning_rate": 1.2939866465904989e-05, "loss": 0.0474, "step": 7996 }, { "epoch": 0.6226071724003698, "grad_norm": 0.5861817638768347, "learning_rate": 1.2935196248996775e-05, "loss": 0.0507, "step": 7997 }, { "epoch": 0.6226850274925795, "grad_norm": 0.6743053493392884, "learning_rate": 1.2930526472192634e-05, "loss": 0.0724, "step": 7998 }, { "epoch": 0.6227628825847891, "grad_norm": 0.7325811746865749, "learning_rate": 1.2925857135783457e-05, "loss": 0.1028, "step": 7999 }, { "epoch": 0.6228407376769987, "grad_norm": 0.6897804779812972, "learning_rate": 1.2921188240060131e-05, "loss": 0.0906, "step": 8000 }, { "epoch": 0.6228407376769987, "eval_loss": 0.009106393903493881, "eval_runtime": 163.0255, "eval_samples_per_second": 17.666, "eval_steps_per_second": 0.632, "step": 8000 }, { "epoch": 0.6229185927692084, "grad_norm": 0.6427691569742798, "learning_rate": 1.2916519785313495e-05, "loss": 0.066, "step": 8001 }, { "epoch": 0.622996447861418, "grad_norm": 0.6084795416034321, "learning_rate": 1.2911851771834394e-05, "loss": 0.0576, "step": 8002 }, { "epoch": 0.6230743029536275, "grad_norm": 0.5546243185844293, "learning_rate": 1.2907184199913604e-05, "loss": 0.0516, "step": 8003 }, { "epoch": 0.6231521580458372, "grad_norm": 0.6782640046972782, "learning_rate": 1.2902517069841904e-05, "loss": 0.0784, "step": 8004 }, { "epoch": 0.6232300131380468, "grad_norm": 0.6250807069208777, "learning_rate": 1.2897850381910026e-05, "loss": 0.0691, "step": 8005 }, { "epoch": 0.6233078682302564, "grad_norm": 0.5483499717308166, "learning_rate": 1.2893184136408681e-05, "loss": 0.0464, "step": 8006 }, { "epoch": 0.6233857233224661, "grad_norm": 0.7849159400788469, "learning_rate": 1.2888518333628568e-05, "loss": 0.1139, "step": 8007 }, { "epoch": 0.6234635784146757, "grad_norm": 0.6340126263403768, "learning_rate": 1.2883852973860333e-05, "loss": 0.0655, "step": 8008 }, { "epoch": 0.6235414335068853, "grad_norm": 0.6088608739016005, "learning_rate": 1.287918805739461e-05, "loss": 0.0556, "step": 8009 }, { "epoch": 0.623619288599095, "grad_norm": 0.6234964594316209, "learning_rate": 1.2874523584522e-05, "loss": 0.0725, "step": 8010 }, { "epoch": 0.6236971436913046, "grad_norm": 0.7064767013639226, "learning_rate": 1.2869859555533074e-05, "loss": 0.072, "step": 8011 }, { "epoch": 0.6237749987835142, "grad_norm": 0.6883886121454416, "learning_rate": 1.2865195970718385e-05, "loss": 0.0956, "step": 8012 }, { "epoch": 0.6238528538757238, "grad_norm": 0.7035681924574614, "learning_rate": 1.286053283036845e-05, "loss": 0.0736, "step": 8013 }, { "epoch": 0.6239307089679335, "grad_norm": 0.5855331650364938, "learning_rate": 1.2855870134773767e-05, "loss": 0.0605, "step": 8014 }, { "epoch": 0.624008564060143, "grad_norm": 0.6129755762194263, "learning_rate": 1.285120788422479e-05, "loss": 0.0655, "step": 8015 }, { "epoch": 0.6240864191523526, "grad_norm": 0.6425367738770027, "learning_rate": 1.2846546079011959e-05, "loss": 0.0843, "step": 8016 }, { "epoch": 0.6241642742445623, "grad_norm": 0.6871446843735669, "learning_rate": 1.2841884719425681e-05, "loss": 0.0516, "step": 8017 }, { "epoch": 0.6242421293367719, "grad_norm": 0.77960109023734, "learning_rate": 1.283722380575634e-05, "loss": 0.1129, "step": 8018 }, { "epoch": 0.6243199844289815, "grad_norm": 0.5463044469361206, "learning_rate": 1.283256333829429e-05, "loss": 0.0512, "step": 8019 }, { "epoch": 0.6243978395211912, "grad_norm": 0.605319526106862, "learning_rate": 1.2827903317329855e-05, "loss": 0.0519, "step": 8020 }, { "epoch": 0.6244756946134008, "grad_norm": 0.6056523132286646, "learning_rate": 1.282324374315333e-05, "loss": 0.0869, "step": 8021 }, { "epoch": 0.6245535497056104, "grad_norm": 0.6916818902623288, "learning_rate": 1.2818584616054981e-05, "loss": 0.0867, "step": 8022 }, { "epoch": 0.6246314047978201, "grad_norm": 0.6260558100432645, "learning_rate": 1.2813925936325063e-05, "loss": 0.0638, "step": 8023 }, { "epoch": 0.6247092598900297, "grad_norm": 0.6348771722420445, "learning_rate": 1.2809267704253778e-05, "loss": 0.0964, "step": 8024 }, { "epoch": 0.6247871149822393, "grad_norm": 0.6555196962663481, "learning_rate": 1.2804609920131315e-05, "loss": 0.0733, "step": 8025 }, { "epoch": 0.624864970074449, "grad_norm": 0.715383296025743, "learning_rate": 1.2799952584247834e-05, "loss": 0.0764, "step": 8026 }, { "epoch": 0.6249428251666586, "grad_norm": 0.663746100656454, "learning_rate": 1.2795295696893461e-05, "loss": 0.0808, "step": 8027 }, { "epoch": 0.6250206802588681, "grad_norm": 0.6145732100956791, "learning_rate": 1.2790639258358302e-05, "loss": 0.067, "step": 8028 }, { "epoch": 0.6250985353510778, "grad_norm": 0.7899851115892638, "learning_rate": 1.2785983268932433e-05, "loss": 0.1311, "step": 8029 }, { "epoch": 0.6251763904432874, "grad_norm": 0.6227177334545494, "learning_rate": 1.2781327728905896e-05, "loss": 0.0642, "step": 8030 }, { "epoch": 0.625254245535497, "grad_norm": 0.5834491945494207, "learning_rate": 1.2776672638568712e-05, "loss": 0.055, "step": 8031 }, { "epoch": 0.6253321006277067, "grad_norm": 0.5867326381288868, "learning_rate": 1.2772017998210866e-05, "loss": 0.055, "step": 8032 }, { "epoch": 0.6254099557199163, "grad_norm": 0.7126385248067438, "learning_rate": 1.2767363808122327e-05, "loss": 0.0822, "step": 8033 }, { "epoch": 0.6254878108121259, "grad_norm": 0.6475334066097308, "learning_rate": 1.2762710068593028e-05, "loss": 0.0618, "step": 8034 }, { "epoch": 0.6255656659043356, "grad_norm": 0.6431558174763539, "learning_rate": 1.2758056779912871e-05, "loss": 0.0638, "step": 8035 }, { "epoch": 0.6256435209965452, "grad_norm": 0.6680984693555537, "learning_rate": 1.2753403942371739e-05, "loss": 0.0809, "step": 8036 }, { "epoch": 0.6257213760887548, "grad_norm": 0.6431622988219979, "learning_rate": 1.2748751556259474e-05, "loss": 0.0686, "step": 8037 }, { "epoch": 0.6257992311809645, "grad_norm": 0.5857826124872638, "learning_rate": 1.2744099621865906e-05, "loss": 0.0654, "step": 8038 }, { "epoch": 0.6258770862731741, "grad_norm": 0.7102993699590295, "learning_rate": 1.2739448139480827e-05, "loss": 0.109, "step": 8039 }, { "epoch": 0.6259549413653837, "grad_norm": 0.5468785241373832, "learning_rate": 1.2734797109394e-05, "loss": 0.0479, "step": 8040 }, { "epoch": 0.6260327964575934, "grad_norm": 0.6931206486705164, "learning_rate": 1.2730146531895167e-05, "loss": 0.0641, "step": 8041 }, { "epoch": 0.6261106515498029, "grad_norm": 0.5406024597485659, "learning_rate": 1.2725496407274027e-05, "loss": 0.0428, "step": 8042 }, { "epoch": 0.6261885066420125, "grad_norm": 0.64516797289237, "learning_rate": 1.2720846735820272e-05, "loss": 0.0758, "step": 8043 }, { "epoch": 0.6262663617342222, "grad_norm": 0.649025724280735, "learning_rate": 1.2716197517823552e-05, "loss": 0.0568, "step": 8044 }, { "epoch": 0.6263442168264318, "grad_norm": 0.648278095368668, "learning_rate": 1.2711548753573488e-05, "loss": 0.0748, "step": 8045 }, { "epoch": 0.6264220719186414, "grad_norm": 0.6776157393273678, "learning_rate": 1.2706900443359684e-05, "loss": 0.0852, "step": 8046 }, { "epoch": 0.626499927010851, "grad_norm": 0.5920275045513246, "learning_rate": 1.2702252587471687e-05, "loss": 0.0605, "step": 8047 }, { "epoch": 0.6265777821030607, "grad_norm": 0.654675139833594, "learning_rate": 1.269760518619906e-05, "loss": 0.0586, "step": 8048 }, { "epoch": 0.6266556371952703, "grad_norm": 0.6610708535648756, "learning_rate": 1.2692958239831308e-05, "loss": 0.073, "step": 8049 }, { "epoch": 0.6267334922874799, "grad_norm": 0.5775926629696981, "learning_rate": 1.2688311748657909e-05, "loss": 0.0511, "step": 8050 }, { "epoch": 0.6267334922874799, "eval_loss": 0.008915742859244347, "eval_runtime": 162.6573, "eval_samples_per_second": 17.706, "eval_steps_per_second": 0.633, "step": 8050 }, { "epoch": 0.6268113473796896, "grad_norm": 0.6831485321943519, "learning_rate": 1.2683665712968324e-05, "loss": 0.0942, "step": 8051 }, { "epoch": 0.6268892024718992, "grad_norm": 0.6022416979915464, "learning_rate": 1.2679020133051974e-05, "loss": 0.0705, "step": 8052 }, { "epoch": 0.6269670575641088, "grad_norm": 0.567498721199782, "learning_rate": 1.2674375009198246e-05, "loss": 0.0653, "step": 8053 }, { "epoch": 0.6270449126563185, "grad_norm": 0.7076720889062147, "learning_rate": 1.2669730341696532e-05, "loss": 0.093, "step": 8054 }, { "epoch": 0.627122767748528, "grad_norm": 0.6289936427503195, "learning_rate": 1.2665086130836163e-05, "loss": 0.0605, "step": 8055 }, { "epoch": 0.6272006228407376, "grad_norm": 0.694321978779589, "learning_rate": 1.2660442376906453e-05, "loss": 0.0739, "step": 8056 }, { "epoch": 0.6272784779329473, "grad_norm": 0.607353233987776, "learning_rate": 1.265579908019668e-05, "loss": 0.0712, "step": 8057 }, { "epoch": 0.6273563330251569, "grad_norm": 0.6338634405154284, "learning_rate": 1.2651156240996096e-05, "loss": 0.0778, "step": 8058 }, { "epoch": 0.6274341881173665, "grad_norm": 0.695771101562081, "learning_rate": 1.2646513859593945e-05, "loss": 0.0823, "step": 8059 }, { "epoch": 0.6275120432095762, "grad_norm": 0.5894635843419341, "learning_rate": 1.2641871936279419e-05, "loss": 0.0458, "step": 8060 }, { "epoch": 0.6275898983017858, "grad_norm": 0.6483912984020701, "learning_rate": 1.2637230471341679e-05, "loss": 0.063, "step": 8061 }, { "epoch": 0.6276677533939954, "grad_norm": 0.6514617383528057, "learning_rate": 1.2632589465069875e-05, "loss": 0.0577, "step": 8062 }, { "epoch": 0.6277456084862051, "grad_norm": 0.6222861651357751, "learning_rate": 1.262794891775311e-05, "loss": 0.0666, "step": 8063 }, { "epoch": 0.6278234635784147, "grad_norm": 0.6532416747968366, "learning_rate": 1.2623308829680488e-05, "loss": 0.0708, "step": 8064 }, { "epoch": 0.6279013186706243, "grad_norm": 0.6913332115987867, "learning_rate": 1.2618669201141045e-05, "loss": 0.0669, "step": 8065 }, { "epoch": 0.627979173762834, "grad_norm": 0.5603044146924374, "learning_rate": 1.2614030032423818e-05, "loss": 0.0674, "step": 8066 }, { "epoch": 0.6280570288550436, "grad_norm": 0.638876565761787, "learning_rate": 1.2609391323817803e-05, "loss": 0.0689, "step": 8067 }, { "epoch": 0.6281348839472531, "grad_norm": 0.6340721217201851, "learning_rate": 1.2604753075611961e-05, "loss": 0.0636, "step": 8068 }, { "epoch": 0.6282127390394628, "grad_norm": 0.6459704705250123, "learning_rate": 1.2600115288095254e-05, "loss": 0.0602, "step": 8069 }, { "epoch": 0.6282905941316724, "grad_norm": 0.6779002584216786, "learning_rate": 1.259547796155658e-05, "loss": 0.0842, "step": 8070 }, { "epoch": 0.628368449223882, "grad_norm": 0.6678253262036348, "learning_rate": 1.259084109628482e-05, "loss": 0.0598, "step": 8071 }, { "epoch": 0.6284463043160917, "grad_norm": 0.6134970626437973, "learning_rate": 1.2586204692568838e-05, "loss": 0.0817, "step": 8072 }, { "epoch": 0.6285241594083013, "grad_norm": 0.7428571952040227, "learning_rate": 1.2581568750697448e-05, "loss": 0.0776, "step": 8073 }, { "epoch": 0.6286020145005109, "grad_norm": 0.6470330033534759, "learning_rate": 1.2576933270959462e-05, "loss": 0.0826, "step": 8074 }, { "epoch": 0.6286798695927206, "grad_norm": 0.5619079754305951, "learning_rate": 1.257229825364364e-05, "loss": 0.0767, "step": 8075 }, { "epoch": 0.6287577246849302, "grad_norm": 0.5691332476708754, "learning_rate": 1.2567663699038724e-05, "loss": 0.0656, "step": 8076 }, { "epoch": 0.6288355797771398, "grad_norm": 0.6134625348287837, "learning_rate": 1.2563029607433426e-05, "loss": 0.0604, "step": 8077 }, { "epoch": 0.6289134348693495, "grad_norm": 0.5634684283838821, "learning_rate": 1.255839597911642e-05, "loss": 0.0551, "step": 8078 }, { "epoch": 0.6289912899615591, "grad_norm": 0.6628436790977493, "learning_rate": 1.255376281437637e-05, "loss": 0.0662, "step": 8079 }, { "epoch": 0.6290691450537687, "grad_norm": 0.5895445175364884, "learning_rate": 1.25491301135019e-05, "loss": 0.0555, "step": 8080 }, { "epoch": 0.6291470001459784, "grad_norm": 0.6759541404337929, "learning_rate": 1.2544497876781598e-05, "loss": 0.0678, "step": 8081 }, { "epoch": 0.6292248552381879, "grad_norm": 0.5882545738699869, "learning_rate": 1.2539866104504035e-05, "loss": 0.0466, "step": 8082 }, { "epoch": 0.6293027103303975, "grad_norm": 0.6785067824271982, "learning_rate": 1.2535234796957743e-05, "loss": 0.0855, "step": 8083 }, { "epoch": 0.6293805654226071, "grad_norm": 0.5910873948595649, "learning_rate": 1.2530603954431241e-05, "loss": 0.0593, "step": 8084 }, { "epoch": 0.6294584205148168, "grad_norm": 0.6329040565587019, "learning_rate": 1.2525973577213004e-05, "loss": 0.0574, "step": 8085 }, { "epoch": 0.6295362756070264, "grad_norm": 0.6340016961520296, "learning_rate": 1.2521343665591482e-05, "loss": 0.0761, "step": 8086 }, { "epoch": 0.629614130699236, "grad_norm": 0.6641140170237421, "learning_rate": 1.2516714219855098e-05, "loss": 0.0736, "step": 8087 }, { "epoch": 0.6296919857914457, "grad_norm": 0.5892106595024023, "learning_rate": 1.2512085240292238e-05, "loss": 0.0519, "step": 8088 }, { "epoch": 0.6297698408836553, "grad_norm": 0.6048416946321361, "learning_rate": 1.2507456727191273e-05, "loss": 0.0547, "step": 8089 }, { "epoch": 0.6298476959758649, "grad_norm": 0.5855901982146454, "learning_rate": 1.250282868084054e-05, "loss": 0.058, "step": 8090 }, { "epoch": 0.6299255510680746, "grad_norm": 0.5266865465447456, "learning_rate": 1.2498201101528338e-05, "loss": 0.0377, "step": 8091 }, { "epoch": 0.6300034061602842, "grad_norm": 0.5915627116726274, "learning_rate": 1.2493573989542944e-05, "loss": 0.057, "step": 8092 }, { "epoch": 0.6300812612524938, "grad_norm": 0.6047827944948762, "learning_rate": 1.2488947345172611e-05, "loss": 0.0616, "step": 8093 }, { "epoch": 0.6301591163447035, "grad_norm": 0.7666665491472848, "learning_rate": 1.2484321168705549e-05, "loss": 0.0913, "step": 8094 }, { "epoch": 0.630236971436913, "grad_norm": 0.6170469436483982, "learning_rate": 1.2479695460429952e-05, "loss": 0.0681, "step": 8095 }, { "epoch": 0.6303148265291226, "grad_norm": 0.7037527063619574, "learning_rate": 1.247507022063398e-05, "loss": 0.0927, "step": 8096 }, { "epoch": 0.6303926816213323, "grad_norm": 0.7258859692759081, "learning_rate": 1.2470445449605766e-05, "loss": 0.0919, "step": 8097 }, { "epoch": 0.6304705367135419, "grad_norm": 0.5863622004378556, "learning_rate": 1.2465821147633409e-05, "loss": 0.0688, "step": 8098 }, { "epoch": 0.6305483918057515, "grad_norm": 0.5917260869500961, "learning_rate": 1.2461197315004972e-05, "loss": 0.0504, "step": 8099 }, { "epoch": 0.6306262468979612, "grad_norm": 0.7348064923054194, "learning_rate": 1.2456573952008513e-05, "loss": 0.1027, "step": 8100 }, { "epoch": 0.6306262468979612, "eval_loss": 0.008844947442412376, "eval_runtime": 163.1472, "eval_samples_per_second": 17.653, "eval_steps_per_second": 0.631, "step": 8100 }, { "epoch": 0.6307041019901708, "grad_norm": 0.7449291262078414, "learning_rate": 1.2451951058932039e-05, "loss": 0.0939, "step": 8101 }, { "epoch": 0.6307819570823804, "grad_norm": 0.5673743657959806, "learning_rate": 1.2447328636063535e-05, "loss": 0.0473, "step": 8102 }, { "epoch": 0.6308598121745901, "grad_norm": 0.5841904310194417, "learning_rate": 1.244270668369096e-05, "loss": 0.0411, "step": 8103 }, { "epoch": 0.6309376672667997, "grad_norm": 0.6849883520833433, "learning_rate": 1.2438085202102228e-05, "loss": 0.0995, "step": 8104 }, { "epoch": 0.6310155223590093, "grad_norm": 0.6337115268069545, "learning_rate": 1.243346419158525e-05, "loss": 0.0706, "step": 8105 }, { "epoch": 0.631093377451219, "grad_norm": 0.6051796328403894, "learning_rate": 1.2428843652427886e-05, "loss": 0.0656, "step": 8106 }, { "epoch": 0.6311712325434286, "grad_norm": 0.6014169492343847, "learning_rate": 1.2424223584917973e-05, "loss": 0.0732, "step": 8107 }, { "epoch": 0.6312490876356381, "grad_norm": 0.6046921019082004, "learning_rate": 1.2419603989343326e-05, "loss": 0.0649, "step": 8108 }, { "epoch": 0.6313269427278478, "grad_norm": 0.6486224305848944, "learning_rate": 1.2414984865991713e-05, "loss": 0.0877, "step": 8109 }, { "epoch": 0.6314047978200574, "grad_norm": 0.6852445902618974, "learning_rate": 1.2410366215150891e-05, "loss": 0.0687, "step": 8110 }, { "epoch": 0.631482652912267, "grad_norm": 0.5602399095534389, "learning_rate": 1.2405748037108585e-05, "loss": 0.0668, "step": 8111 }, { "epoch": 0.6315605080044767, "grad_norm": 0.5574728254805452, "learning_rate": 1.2401130332152475e-05, "loss": 0.0692, "step": 8112 }, { "epoch": 0.6316383630966863, "grad_norm": 0.686137853974497, "learning_rate": 1.2396513100570236e-05, "loss": 0.0929, "step": 8113 }, { "epoch": 0.6317162181888959, "grad_norm": 0.6385528283599888, "learning_rate": 1.2391896342649474e-05, "loss": 0.0848, "step": 8114 }, { "epoch": 0.6317940732811056, "grad_norm": 0.5728597587349957, "learning_rate": 1.2387280058677822e-05, "loss": 0.0682, "step": 8115 }, { "epoch": 0.6318719283733152, "grad_norm": 0.5918029123799242, "learning_rate": 1.2382664248942837e-05, "loss": 0.0469, "step": 8116 }, { "epoch": 0.6319497834655248, "grad_norm": 0.7134674948249219, "learning_rate": 1.2378048913732066e-05, "loss": 0.0984, "step": 8117 }, { "epoch": 0.6320276385577344, "grad_norm": 0.6225961739959537, "learning_rate": 1.2373434053333022e-05, "loss": 0.0619, "step": 8118 }, { "epoch": 0.6321054936499441, "grad_norm": 0.7076132860741308, "learning_rate": 1.2368819668033179e-05, "loss": 0.0906, "step": 8119 }, { "epoch": 0.6321833487421537, "grad_norm": 0.6559358519075557, "learning_rate": 1.2364205758120009e-05, "loss": 0.0779, "step": 8120 }, { "epoch": 0.6322612038343632, "grad_norm": 0.6538846215775967, "learning_rate": 1.2359592323880927e-05, "loss": 0.0833, "step": 8121 }, { "epoch": 0.6323390589265729, "grad_norm": 0.6127457778280707, "learning_rate": 1.2354979365603337e-05, "loss": 0.0583, "step": 8122 }, { "epoch": 0.6324169140187825, "grad_norm": 0.5994335300508579, "learning_rate": 1.2350366883574592e-05, "loss": 0.038, "step": 8123 }, { "epoch": 0.6324947691109921, "grad_norm": 0.6979021486393475, "learning_rate": 1.2345754878082026e-05, "loss": 0.0908, "step": 8124 }, { "epoch": 0.6325726242032018, "grad_norm": 0.6768484906077333, "learning_rate": 1.2341143349412961e-05, "loss": 0.0654, "step": 8125 }, { "epoch": 0.6326504792954114, "grad_norm": 0.721653996345239, "learning_rate": 1.2336532297854666e-05, "loss": 0.0938, "step": 8126 }, { "epoch": 0.632728334387621, "grad_norm": 0.6638814930009749, "learning_rate": 1.2331921723694391e-05, "loss": 0.0706, "step": 8127 }, { "epoch": 0.6328061894798307, "grad_norm": 0.6167609060797856, "learning_rate": 1.2327311627219346e-05, "loss": 0.0663, "step": 8128 }, { "epoch": 0.6328840445720403, "grad_norm": 0.6175225087206996, "learning_rate": 1.232270200871672e-05, "loss": 0.0591, "step": 8129 }, { "epoch": 0.6329618996642499, "grad_norm": 0.7518105220310894, "learning_rate": 1.2318092868473665e-05, "loss": 0.1144, "step": 8130 }, { "epoch": 0.6330397547564596, "grad_norm": 0.6202438992609616, "learning_rate": 1.2313484206777329e-05, "loss": 0.0582, "step": 8131 }, { "epoch": 0.6331176098486692, "grad_norm": 0.5484105070346992, "learning_rate": 1.2308876023914794e-05, "loss": 0.0444, "step": 8132 }, { "epoch": 0.6331954649408787, "grad_norm": 0.6056478860973559, "learning_rate": 1.2304268320173129e-05, "loss": 0.0654, "step": 8133 }, { "epoch": 0.6332733200330884, "grad_norm": 0.6231048573452955, "learning_rate": 1.2299661095839378e-05, "loss": 0.076, "step": 8134 }, { "epoch": 0.633351175125298, "grad_norm": 0.5727018241934093, "learning_rate": 1.2295054351200532e-05, "loss": 0.0684, "step": 8135 }, { "epoch": 0.6334290302175076, "grad_norm": 0.5268647576267245, "learning_rate": 1.22904480865436e-05, "loss": 0.0503, "step": 8136 }, { "epoch": 0.6335068853097173, "grad_norm": 0.6967430102957284, "learning_rate": 1.2285842302155509e-05, "loss": 0.0857, "step": 8137 }, { "epoch": 0.6335847404019269, "grad_norm": 0.7847758072092921, "learning_rate": 1.2281236998323183e-05, "loss": 0.138, "step": 8138 }, { "epoch": 0.6336625954941365, "grad_norm": 0.6181199007982003, "learning_rate": 1.2276632175333512e-05, "loss": 0.0681, "step": 8139 }, { "epoch": 0.6337404505863462, "grad_norm": 0.6019946137751369, "learning_rate": 1.2272027833473348e-05, "loss": 0.0686, "step": 8140 }, { "epoch": 0.6338183056785558, "grad_norm": 0.6102029817586184, "learning_rate": 1.2267423973029527e-05, "loss": 0.0601, "step": 8141 }, { "epoch": 0.6338961607707654, "grad_norm": 0.6533637809399533, "learning_rate": 1.226282059428885e-05, "loss": 0.0715, "step": 8142 }, { "epoch": 0.6339740158629751, "grad_norm": 0.5285138005875646, "learning_rate": 1.225821769753808e-05, "loss": 0.0466, "step": 8143 }, { "epoch": 0.6340518709551847, "grad_norm": 0.6634174952833286, "learning_rate": 1.2253615283063957e-05, "loss": 0.0792, "step": 8144 }, { "epoch": 0.6341297260473943, "grad_norm": 0.6686272831228827, "learning_rate": 1.2249013351153187e-05, "loss": 0.0812, "step": 8145 }, { "epoch": 0.634207581139604, "grad_norm": 0.5410386403303041, "learning_rate": 1.224441190209246e-05, "loss": 0.0471, "step": 8146 }, { "epoch": 0.6342854362318135, "grad_norm": 0.6662433727690587, "learning_rate": 1.2239810936168413e-05, "loss": 0.0634, "step": 8147 }, { "epoch": 0.6343632913240231, "grad_norm": 0.5709213081672907, "learning_rate": 1.2235210453667672e-05, "loss": 0.0404, "step": 8148 }, { "epoch": 0.6344411464162328, "grad_norm": 0.6143467713026095, "learning_rate": 1.2230610454876817e-05, "loss": 0.0557, "step": 8149 }, { "epoch": 0.6345190015084424, "grad_norm": 0.6537712211851269, "learning_rate": 1.2226010940082412e-05, "loss": 0.0638, "step": 8150 }, { "epoch": 0.6345190015084424, "eval_loss": 0.00883497204631567, "eval_runtime": 162.8453, "eval_samples_per_second": 17.686, "eval_steps_per_second": 0.633, "step": 8150 }, { "epoch": 0.634596856600652, "grad_norm": 0.6964260514021456, "learning_rate": 1.2221411909570985e-05, "loss": 0.0989, "step": 8151 }, { "epoch": 0.6346747116928617, "grad_norm": 0.5474885626328625, "learning_rate": 1.2216813363629037e-05, "loss": 0.0634, "step": 8152 }, { "epoch": 0.6347525667850713, "grad_norm": 0.533066575625455, "learning_rate": 1.2212215302543029e-05, "loss": 0.0527, "step": 8153 }, { "epoch": 0.6348304218772809, "grad_norm": 0.6403946581707735, "learning_rate": 1.2207617726599404e-05, "loss": 0.0738, "step": 8154 }, { "epoch": 0.6349082769694905, "grad_norm": 0.6226182003609253, "learning_rate": 1.2203020636084562e-05, "loss": 0.0688, "step": 8155 }, { "epoch": 0.6349861320617002, "grad_norm": 0.6927362147352604, "learning_rate": 1.2198424031284888e-05, "loss": 0.0779, "step": 8156 }, { "epoch": 0.6350639871539098, "grad_norm": 0.6062107646795748, "learning_rate": 1.219382791248673e-05, "loss": 0.0612, "step": 8157 }, { "epoch": 0.6351418422461194, "grad_norm": 0.6192641315117918, "learning_rate": 1.2189232279976398e-05, "loss": 0.0658, "step": 8158 }, { "epoch": 0.6352196973383291, "grad_norm": 0.5828523714031106, "learning_rate": 1.218463713404018e-05, "loss": 0.0535, "step": 8159 }, { "epoch": 0.6352975524305386, "grad_norm": 0.6066205147500582, "learning_rate": 1.2180042474964333e-05, "loss": 0.0666, "step": 8160 }, { "epoch": 0.6353754075227482, "grad_norm": 0.6614048656193915, "learning_rate": 1.2175448303035085e-05, "loss": 0.0713, "step": 8161 }, { "epoch": 0.6354532626149579, "grad_norm": 0.5321338706795703, "learning_rate": 1.2170854618538628e-05, "loss": 0.0366, "step": 8162 }, { "epoch": 0.6355311177071675, "grad_norm": 0.5978346388685107, "learning_rate": 1.2166261421761129e-05, "loss": 0.0482, "step": 8163 }, { "epoch": 0.6356089727993771, "grad_norm": 0.633833917018547, "learning_rate": 1.2161668712988721e-05, "loss": 0.0654, "step": 8164 }, { "epoch": 0.6356868278915868, "grad_norm": 0.5488268811475204, "learning_rate": 1.2157076492507507e-05, "loss": 0.0522, "step": 8165 }, { "epoch": 0.6357646829837964, "grad_norm": 0.5850211653583449, "learning_rate": 1.2152484760603564e-05, "loss": 0.0546, "step": 8166 }, { "epoch": 0.635842538076006, "grad_norm": 0.5649236314642461, "learning_rate": 1.2147893517562932e-05, "loss": 0.0424, "step": 8167 }, { "epoch": 0.6359203931682157, "grad_norm": 0.6001634748072252, "learning_rate": 1.2143302763671629e-05, "loss": 0.0687, "step": 8168 }, { "epoch": 0.6359982482604253, "grad_norm": 0.6367785550501026, "learning_rate": 1.2138712499215633e-05, "loss": 0.0621, "step": 8169 }, { "epoch": 0.6360761033526349, "grad_norm": 0.6404109055836014, "learning_rate": 1.21341227244809e-05, "loss": 0.0832, "step": 8170 }, { "epoch": 0.6361539584448446, "grad_norm": 0.6418466179553867, "learning_rate": 1.2129533439753338e-05, "loss": 0.0869, "step": 8171 }, { "epoch": 0.6362318135370542, "grad_norm": 0.611581397554825, "learning_rate": 1.212494464531886e-05, "loss": 0.0617, "step": 8172 }, { "epoch": 0.6363096686292637, "grad_norm": 0.5827325792356473, "learning_rate": 1.212035634146331e-05, "loss": 0.0619, "step": 8173 }, { "epoch": 0.6363875237214734, "grad_norm": 0.6116143033567273, "learning_rate": 1.211576852847252e-05, "loss": 0.0757, "step": 8174 }, { "epoch": 0.636465378813683, "grad_norm": 0.529184487108592, "learning_rate": 1.2111181206632298e-05, "loss": 0.0556, "step": 8175 }, { "epoch": 0.6365432339058926, "grad_norm": 0.6509824998610936, "learning_rate": 1.2106594376228395e-05, "loss": 0.0725, "step": 8176 }, { "epoch": 0.6366210889981023, "grad_norm": 0.6545995948924642, "learning_rate": 1.2102008037546567e-05, "loss": 0.0736, "step": 8177 }, { "epoch": 0.6366989440903119, "grad_norm": 0.6981298144872443, "learning_rate": 1.2097422190872514e-05, "loss": 0.0693, "step": 8178 }, { "epoch": 0.6367767991825215, "grad_norm": 0.6386234359440481, "learning_rate": 1.2092836836491918e-05, "loss": 0.0677, "step": 8179 }, { "epoch": 0.6368546542747312, "grad_norm": 0.5458953256398291, "learning_rate": 1.2088251974690418e-05, "loss": 0.0532, "step": 8180 }, { "epoch": 0.6369325093669408, "grad_norm": 0.4881489552863652, "learning_rate": 1.2083667605753622e-05, "loss": 0.0452, "step": 8181 }, { "epoch": 0.6370103644591504, "grad_norm": 0.5855178704870346, "learning_rate": 1.2079083729967132e-05, "loss": 0.0517, "step": 8182 }, { "epoch": 0.6370882195513601, "grad_norm": 0.5395573937600119, "learning_rate": 1.2074500347616493e-05, "loss": 0.0603, "step": 8183 }, { "epoch": 0.6371660746435697, "grad_norm": 0.5653851018389318, "learning_rate": 1.2069917458987233e-05, "loss": 0.0656, "step": 8184 }, { "epoch": 0.6372439297357793, "grad_norm": 0.6787941803263293, "learning_rate": 1.2065335064364838e-05, "loss": 0.0859, "step": 8185 }, { "epoch": 0.637321784827989, "grad_norm": 0.635430348445698, "learning_rate": 1.2060753164034764e-05, "loss": 0.0506, "step": 8186 }, { "epoch": 0.6373996399201985, "grad_norm": 0.6580083540395962, "learning_rate": 1.205617175828246e-05, "loss": 0.0676, "step": 8187 }, { "epoch": 0.6374774950124081, "grad_norm": 0.7059896041033716, "learning_rate": 1.2051590847393313e-05, "loss": 0.0923, "step": 8188 }, { "epoch": 0.6375553501046178, "grad_norm": 0.5933386512782133, "learning_rate": 1.2047010431652702e-05, "loss": 0.0711, "step": 8189 }, { "epoch": 0.6376332051968274, "grad_norm": 0.5795662835546904, "learning_rate": 1.2042430511345952e-05, "loss": 0.069, "step": 8190 }, { "epoch": 0.637711060289037, "grad_norm": 0.7157177505146846, "learning_rate": 1.2037851086758369e-05, "loss": 0.1, "step": 8191 }, { "epoch": 0.6377889153812466, "grad_norm": 0.5397619913901364, "learning_rate": 1.2033272158175248e-05, "loss": 0.0456, "step": 8192 }, { "epoch": 0.6378667704734563, "grad_norm": 0.4873711725638301, "learning_rate": 1.2028693725881828e-05, "loss": 0.0348, "step": 8193 }, { "epoch": 0.6379446255656659, "grad_norm": 0.5246125637195099, "learning_rate": 1.2024115790163316e-05, "loss": 0.0477, "step": 8194 }, { "epoch": 0.6380224806578755, "grad_norm": 0.6696390036593127, "learning_rate": 1.2019538351304902e-05, "loss": 0.061, "step": 8195 }, { "epoch": 0.6381003357500852, "grad_norm": 0.6445676401597987, "learning_rate": 1.2014961409591727e-05, "loss": 0.0664, "step": 8196 }, { "epoch": 0.6381781908422948, "grad_norm": 0.621304857054625, "learning_rate": 1.2010384965308934e-05, "loss": 0.0681, "step": 8197 }, { "epoch": 0.6382560459345044, "grad_norm": 0.7934069406886337, "learning_rate": 1.2005809018741603e-05, "loss": 0.0987, "step": 8198 }, { "epoch": 0.6383339010267141, "grad_norm": 0.62307877487919, "learning_rate": 1.2001233570174792e-05, "loss": 0.0758, "step": 8199 }, { "epoch": 0.6384117561189236, "grad_norm": 0.6467587409568423, "learning_rate": 1.1996658619893532e-05, "loss": 0.0714, "step": 8200 }, { "epoch": 0.6384117561189236, "eval_loss": 0.008694063872098923, "eval_runtime": 162.3857, "eval_samples_per_second": 17.736, "eval_steps_per_second": 0.634, "step": 8200 }, { "epoch": 0.6384896112111332, "grad_norm": 0.5534737687544721, "learning_rate": 1.1992084168182816e-05, "loss": 0.0566, "step": 8201 }, { "epoch": 0.6385674663033429, "grad_norm": 0.6275986106371128, "learning_rate": 1.1987510215327624e-05, "loss": 0.0789, "step": 8202 }, { "epoch": 0.6386453213955525, "grad_norm": 0.5622376357915808, "learning_rate": 1.1982936761612882e-05, "loss": 0.0652, "step": 8203 }, { "epoch": 0.6387231764877621, "grad_norm": 0.5692827132710095, "learning_rate": 1.1978363807323495e-05, "loss": 0.0644, "step": 8204 }, { "epoch": 0.6388010315799718, "grad_norm": 0.6846865119551867, "learning_rate": 1.1973791352744337e-05, "loss": 0.0785, "step": 8205 }, { "epoch": 0.6388788866721814, "grad_norm": 0.49996089979898567, "learning_rate": 1.1969219398160252e-05, "loss": 0.0574, "step": 8206 }, { "epoch": 0.638956741764391, "grad_norm": 0.6142077508929299, "learning_rate": 1.1964647943856048e-05, "loss": 0.0875, "step": 8207 }, { "epoch": 0.6390345968566007, "grad_norm": 0.7528061988286228, "learning_rate": 1.196007699011651e-05, "loss": 0.0799, "step": 8208 }, { "epoch": 0.6391124519488103, "grad_norm": 0.5917086332386865, "learning_rate": 1.1955506537226385e-05, "loss": 0.0479, "step": 8209 }, { "epoch": 0.6391903070410199, "grad_norm": 0.6714296015254736, "learning_rate": 1.1950936585470389e-05, "loss": 0.0758, "step": 8210 }, { "epoch": 0.6392681621332296, "grad_norm": 0.6219801377743742, "learning_rate": 1.194636713513321e-05, "loss": 0.0751, "step": 8211 }, { "epoch": 0.6393460172254392, "grad_norm": 0.644436825552405, "learning_rate": 1.19417981864995e-05, "loss": 0.0861, "step": 8212 }, { "epoch": 0.6394238723176487, "grad_norm": 0.6658905522596587, "learning_rate": 1.193722973985389e-05, "loss": 0.0713, "step": 8213 }, { "epoch": 0.6395017274098584, "grad_norm": 0.6185965530962764, "learning_rate": 1.1932661795480969e-05, "loss": 0.0628, "step": 8214 }, { "epoch": 0.639579582502068, "grad_norm": 0.5948156542110966, "learning_rate": 1.1928094353665299e-05, "loss": 0.0637, "step": 8215 }, { "epoch": 0.6396574375942776, "grad_norm": 0.6767520532385212, "learning_rate": 1.192352741469141e-05, "loss": 0.0755, "step": 8216 }, { "epoch": 0.6397352926864873, "grad_norm": 0.5754438184661061, "learning_rate": 1.1918960978843792e-05, "loss": 0.0606, "step": 8217 }, { "epoch": 0.6398131477786969, "grad_norm": 0.6079880723228686, "learning_rate": 1.1914395046406928e-05, "loss": 0.0523, "step": 8218 }, { "epoch": 0.6398910028709065, "grad_norm": 0.538105932253008, "learning_rate": 1.1909829617665248e-05, "loss": 0.045, "step": 8219 }, { "epoch": 0.6399688579631162, "grad_norm": 0.6113377447268553, "learning_rate": 1.1905264692903154e-05, "loss": 0.0499, "step": 8220 }, { "epoch": 0.6400467130553258, "grad_norm": 0.6457082110011158, "learning_rate": 1.190070027240502e-05, "loss": 0.0666, "step": 8221 }, { "epoch": 0.6401245681475354, "grad_norm": 0.5417740140445211, "learning_rate": 1.1896136356455186e-05, "loss": 0.0511, "step": 8222 }, { "epoch": 0.6402024232397451, "grad_norm": 0.6701280360098588, "learning_rate": 1.189157294533797e-05, "loss": 0.0702, "step": 8223 }, { "epoch": 0.6402802783319547, "grad_norm": 0.576320205341206, "learning_rate": 1.1887010039337646e-05, "loss": 0.062, "step": 8224 }, { "epoch": 0.6403581334241643, "grad_norm": 0.5698147729017556, "learning_rate": 1.1882447638738462e-05, "loss": 0.0513, "step": 8225 }, { "epoch": 0.6404359885163738, "grad_norm": 0.5907439626629665, "learning_rate": 1.1877885743824635e-05, "loss": 0.0573, "step": 8226 }, { "epoch": 0.6405138436085835, "grad_norm": 0.6359150230505397, "learning_rate": 1.1873324354880345e-05, "loss": 0.0654, "step": 8227 }, { "epoch": 0.6405916987007931, "grad_norm": 0.6504233672476482, "learning_rate": 1.1868763472189753e-05, "loss": 0.061, "step": 8228 }, { "epoch": 0.6406695537930027, "grad_norm": 0.5595351086686751, "learning_rate": 1.1864203096036974e-05, "loss": 0.0522, "step": 8229 }, { "epoch": 0.6407474088852124, "grad_norm": 0.706459286947123, "learning_rate": 1.1859643226706102e-05, "loss": 0.101, "step": 8230 }, { "epoch": 0.640825263977422, "grad_norm": 0.635744645152967, "learning_rate": 1.1855083864481194e-05, "loss": 0.075, "step": 8231 }, { "epoch": 0.6409031190696316, "grad_norm": 0.5672891268344243, "learning_rate": 1.1850525009646275e-05, "loss": 0.0556, "step": 8232 }, { "epoch": 0.6409809741618413, "grad_norm": 0.6045582761485249, "learning_rate": 1.1845966662485342e-05, "loss": 0.0767, "step": 8233 }, { "epoch": 0.6410588292540509, "grad_norm": 0.6934143969009406, "learning_rate": 1.184140882328236e-05, "loss": 0.078, "step": 8234 }, { "epoch": 0.6411366843462605, "grad_norm": 0.5544959130044219, "learning_rate": 1.183685149232126e-05, "loss": 0.0551, "step": 8235 }, { "epoch": 0.6412145394384702, "grad_norm": 0.6435635414104757, "learning_rate": 1.1832294669885944e-05, "loss": 0.063, "step": 8236 }, { "epoch": 0.6412923945306798, "grad_norm": 0.6149578881786506, "learning_rate": 1.1827738356260271e-05, "loss": 0.0637, "step": 8237 }, { "epoch": 0.6413702496228894, "grad_norm": 0.554937049025306, "learning_rate": 1.1823182551728091e-05, "loss": 0.0538, "step": 8238 }, { "epoch": 0.641448104715099, "grad_norm": 0.7178937874077711, "learning_rate": 1.1818627256573203e-05, "loss": 0.0991, "step": 8239 }, { "epoch": 0.6415259598073086, "grad_norm": 0.4957999266971416, "learning_rate": 1.1814072471079384e-05, "loss": 0.0457, "step": 8240 }, { "epoch": 0.6416038148995182, "grad_norm": 0.5422632162489378, "learning_rate": 1.180951819553037e-05, "loss": 0.0563, "step": 8241 }, { "epoch": 0.6416816699917279, "grad_norm": 0.7194428365365939, "learning_rate": 1.1804964430209868e-05, "loss": 0.0868, "step": 8242 }, { "epoch": 0.6417595250839375, "grad_norm": 0.5973921004002077, "learning_rate": 1.180041117540157e-05, "loss": 0.0395, "step": 8243 }, { "epoch": 0.6418373801761471, "grad_norm": 0.5969555616668908, "learning_rate": 1.1795858431389111e-05, "loss": 0.0787, "step": 8244 }, { "epoch": 0.6419152352683568, "grad_norm": 0.5818827271855898, "learning_rate": 1.179130619845611e-05, "loss": 0.0577, "step": 8245 }, { "epoch": 0.6419930903605664, "grad_norm": 0.5820430579827003, "learning_rate": 1.1786754476886149e-05, "loss": 0.047, "step": 8246 }, { "epoch": 0.642070945452776, "grad_norm": 0.6746161025934241, "learning_rate": 1.1782203266962781e-05, "loss": 0.0822, "step": 8247 }, { "epoch": 0.6421488005449857, "grad_norm": 0.50441998140526, "learning_rate": 1.1777652568969511e-05, "loss": 0.0507, "step": 8248 }, { "epoch": 0.6422266556371953, "grad_norm": 0.5452499893645929, "learning_rate": 1.1773102383189844e-05, "loss": 0.0531, "step": 8249 }, { "epoch": 0.6423045107294049, "grad_norm": 0.4501275408862246, "learning_rate": 1.1768552709907232e-05, "loss": 0.0334, "step": 8250 }, { "epoch": 0.6423045107294049, "eval_loss": 0.008634491823613644, "eval_runtime": 162.4255, "eval_samples_per_second": 17.731, "eval_steps_per_second": 0.634, "step": 8250 }, { "epoch": 0.6423823658216146, "grad_norm": 0.5858211492141494, "learning_rate": 1.1764003549405098e-05, "loss": 0.0604, "step": 8251 }, { "epoch": 0.6424602209138242, "grad_norm": 0.6710916613113765, "learning_rate": 1.1759454901966824e-05, "loss": 0.0703, "step": 8252 }, { "epoch": 0.6425380760060337, "grad_norm": 0.661559665631783, "learning_rate": 1.1754906767875771e-05, "loss": 0.065, "step": 8253 }, { "epoch": 0.6426159310982434, "grad_norm": 0.5627517940489912, "learning_rate": 1.1750359147415278e-05, "loss": 0.0637, "step": 8254 }, { "epoch": 0.642693786190453, "grad_norm": 0.5398849877444197, "learning_rate": 1.1745812040868633e-05, "loss": 0.0549, "step": 8255 }, { "epoch": 0.6427716412826626, "grad_norm": 0.5823626413989642, "learning_rate": 1.1741265448519104e-05, "loss": 0.0735, "step": 8256 }, { "epoch": 0.6428494963748723, "grad_norm": 0.6959346437544808, "learning_rate": 1.1736719370649914e-05, "loss": 0.0754, "step": 8257 }, { "epoch": 0.6429273514670819, "grad_norm": 0.5744871155765136, "learning_rate": 1.1732173807544256e-05, "loss": 0.0634, "step": 8258 }, { "epoch": 0.6430052065592915, "grad_norm": 0.593458561657028, "learning_rate": 1.1727628759485319e-05, "loss": 0.0594, "step": 8259 }, { "epoch": 0.6430830616515012, "grad_norm": 0.5810455790897227, "learning_rate": 1.172308422675623e-05, "loss": 0.0683, "step": 8260 }, { "epoch": 0.6431609167437108, "grad_norm": 0.6910979891708116, "learning_rate": 1.1718540209640084e-05, "loss": 0.0903, "step": 8261 }, { "epoch": 0.6432387718359204, "grad_norm": 0.5537435605979307, "learning_rate": 1.1713996708419955e-05, "loss": 0.0497, "step": 8262 }, { "epoch": 0.64331662692813, "grad_norm": 0.6617389608489939, "learning_rate": 1.1709453723378878e-05, "loss": 0.071, "step": 8263 }, { "epoch": 0.6433944820203397, "grad_norm": 0.662194764731695, "learning_rate": 1.1704911254799871e-05, "loss": 0.0664, "step": 8264 }, { "epoch": 0.6434723371125493, "grad_norm": 0.6547157003490305, "learning_rate": 1.170036930296591e-05, "loss": 0.0594, "step": 8265 }, { "epoch": 0.6435501922047588, "grad_norm": 0.6432118687899625, "learning_rate": 1.1695827868159922e-05, "loss": 0.0723, "step": 8266 }, { "epoch": 0.6436280472969685, "grad_norm": 0.5574250943770336, "learning_rate": 1.1691286950664826e-05, "loss": 0.0442, "step": 8267 }, { "epoch": 0.6437059023891781, "grad_norm": 0.6306013787950523, "learning_rate": 1.1686746550763491e-05, "loss": 0.0728, "step": 8268 }, { "epoch": 0.6437837574813877, "grad_norm": 0.5381322302851809, "learning_rate": 1.1682206668738783e-05, "loss": 0.0615, "step": 8269 }, { "epoch": 0.6438616125735974, "grad_norm": 0.6091093490196181, "learning_rate": 1.1677667304873497e-05, "loss": 0.0661, "step": 8270 }, { "epoch": 0.643939467665807, "grad_norm": 0.5548557471467336, "learning_rate": 1.1673128459450415e-05, "loss": 0.0655, "step": 8271 }, { "epoch": 0.6440173227580166, "grad_norm": 0.6136060746064546, "learning_rate": 1.1668590132752295e-05, "loss": 0.0583, "step": 8272 }, { "epoch": 0.6440951778502263, "grad_norm": 0.5485740045727076, "learning_rate": 1.1664052325061847e-05, "loss": 0.0532, "step": 8273 }, { "epoch": 0.6441730329424359, "grad_norm": 0.590259842989755, "learning_rate": 1.165951503666176e-05, "loss": 0.0764, "step": 8274 }, { "epoch": 0.6442508880346455, "grad_norm": 0.5769999090619505, "learning_rate": 1.165497826783468e-05, "loss": 0.0635, "step": 8275 }, { "epoch": 0.6443287431268552, "grad_norm": 0.6978059116985289, "learning_rate": 1.165044201886323e-05, "loss": 0.0724, "step": 8276 }, { "epoch": 0.6444065982190648, "grad_norm": 0.6660119632213293, "learning_rate": 1.1645906290029996e-05, "loss": 0.07, "step": 8277 }, { "epoch": 0.6444844533112744, "grad_norm": 0.6345818663004773, "learning_rate": 1.1641371081617522e-05, "loss": 0.0709, "step": 8278 }, { "epoch": 0.644562308403484, "grad_norm": 0.6850209245303696, "learning_rate": 1.1636836393908353e-05, "loss": 0.1052, "step": 8279 }, { "epoch": 0.6446401634956936, "grad_norm": 0.5698750158510586, "learning_rate": 1.1632302227184962e-05, "loss": 0.0736, "step": 8280 }, { "epoch": 0.6447180185879032, "grad_norm": 0.6430596490182234, "learning_rate": 1.1627768581729812e-05, "loss": 0.0626, "step": 8281 }, { "epoch": 0.6447958736801129, "grad_norm": 0.5507581169359953, "learning_rate": 1.1623235457825336e-05, "loss": 0.074, "step": 8282 }, { "epoch": 0.6448737287723225, "grad_norm": 0.5539510397601412, "learning_rate": 1.1618702855753907e-05, "loss": 0.0518, "step": 8283 }, { "epoch": 0.6449515838645321, "grad_norm": 0.5654985044330056, "learning_rate": 1.1614170775797886e-05, "loss": 0.0575, "step": 8284 }, { "epoch": 0.6450294389567418, "grad_norm": 0.5608040199091252, "learning_rate": 1.160963921823962e-05, "loss": 0.0502, "step": 8285 }, { "epoch": 0.6451072940489514, "grad_norm": 0.6664796016755296, "learning_rate": 1.1605108183361388e-05, "loss": 0.0827, "step": 8286 }, { "epoch": 0.645185149141161, "grad_norm": 0.7083956928767592, "learning_rate": 1.1600577671445461e-05, "loss": 0.113, "step": 8287 }, { "epoch": 0.6452630042333707, "grad_norm": 0.5971971102757737, "learning_rate": 1.159604768277406e-05, "loss": 0.0711, "step": 8288 }, { "epoch": 0.6453408593255803, "grad_norm": 0.5260341688103463, "learning_rate": 1.1591518217629383e-05, "loss": 0.0283, "step": 8289 }, { "epoch": 0.6454187144177899, "grad_norm": 0.5666183234152492, "learning_rate": 1.1586989276293603e-05, "loss": 0.0638, "step": 8290 }, { "epoch": 0.6454965695099996, "grad_norm": 0.5841830536651179, "learning_rate": 1.1582460859048857e-05, "loss": 0.0495, "step": 8291 }, { "epoch": 0.6455744246022092, "grad_norm": 0.6149969261906433, "learning_rate": 1.157793296617722e-05, "loss": 0.0547, "step": 8292 }, { "epoch": 0.6456522796944187, "grad_norm": 0.6058474455803357, "learning_rate": 1.1573405597960777e-05, "loss": 0.0717, "step": 8293 }, { "epoch": 0.6457301347866284, "grad_norm": 0.6120439139760578, "learning_rate": 1.1568878754681547e-05, "loss": 0.0494, "step": 8294 }, { "epoch": 0.645807989878838, "grad_norm": 0.6471509142309779, "learning_rate": 1.1564352436621548e-05, "loss": 0.0674, "step": 8295 }, { "epoch": 0.6458858449710476, "grad_norm": 0.5503615685846975, "learning_rate": 1.1559826644062743e-05, "loss": 0.0529, "step": 8296 }, { "epoch": 0.6459637000632573, "grad_norm": 0.5263567155491886, "learning_rate": 1.1555301377287064e-05, "loss": 0.0593, "step": 8297 }, { "epoch": 0.6460415551554669, "grad_norm": 0.5651336504508981, "learning_rate": 1.1550776636576418e-05, "loss": 0.0504, "step": 8298 }, { "epoch": 0.6461194102476765, "grad_norm": 0.5291522424271803, "learning_rate": 1.154625242221266e-05, "loss": 0.0553, "step": 8299 }, { "epoch": 0.6461972653398861, "grad_norm": 0.5814281135781739, "learning_rate": 1.1541728734477662e-05, "loss": 0.0551, "step": 8300 }, { "epoch": 0.6461972653398861, "eval_loss": 0.008570182137191296, "eval_runtime": 162.6676, "eval_samples_per_second": 17.705, "eval_steps_per_second": 0.633, "step": 8300 }, { "epoch": 0.6462751204320958, "grad_norm": 0.6706921819180925, "learning_rate": 1.1537205573653195e-05, "loss": 0.0816, "step": 8301 }, { "epoch": 0.6463529755243054, "grad_norm": 0.5726951252655427, "learning_rate": 1.1532682940021042e-05, "loss": 0.0538, "step": 8302 }, { "epoch": 0.646430830616515, "grad_norm": 0.5850791696786395, "learning_rate": 1.1528160833862943e-05, "loss": 0.0432, "step": 8303 }, { "epoch": 0.6465086857087247, "grad_norm": 0.6760761476305608, "learning_rate": 1.1523639255460593e-05, "loss": 0.0598, "step": 8304 }, { "epoch": 0.6465865408009343, "grad_norm": 0.5962887429679735, "learning_rate": 1.1519118205095684e-05, "loss": 0.0636, "step": 8305 }, { "epoch": 0.6466643958931438, "grad_norm": 0.7289629596003884, "learning_rate": 1.1514597683049849e-05, "loss": 0.0904, "step": 8306 }, { "epoch": 0.6467422509853535, "grad_norm": 0.6490524156344689, "learning_rate": 1.151007768960469e-05, "loss": 0.0738, "step": 8307 }, { "epoch": 0.6468201060775631, "grad_norm": 0.6358591698528073, "learning_rate": 1.150555822504179e-05, "loss": 0.0828, "step": 8308 }, { "epoch": 0.6468979611697727, "grad_norm": 0.533192565370264, "learning_rate": 1.150103928964268e-05, "loss": 0.0619, "step": 8309 }, { "epoch": 0.6469758162619824, "grad_norm": 0.5010001644350972, "learning_rate": 1.1496520883688875e-05, "loss": 0.0548, "step": 8310 }, { "epoch": 0.647053671354192, "grad_norm": 0.557584914364735, "learning_rate": 1.149200300746185e-05, "loss": 0.0604, "step": 8311 }, { "epoch": 0.6471315264464016, "grad_norm": 0.5466402140265548, "learning_rate": 1.1487485661243047e-05, "loss": 0.0604, "step": 8312 }, { "epoch": 0.6472093815386113, "grad_norm": 0.6076287962262404, "learning_rate": 1.1482968845313876e-05, "loss": 0.0669, "step": 8313 }, { "epoch": 0.6472872366308209, "grad_norm": 0.8265803811505534, "learning_rate": 1.14784525599557e-05, "loss": 0.0955, "step": 8314 }, { "epoch": 0.6473650917230305, "grad_norm": 0.5653660577958717, "learning_rate": 1.1473936805449886e-05, "loss": 0.0463, "step": 8315 }, { "epoch": 0.6474429468152402, "grad_norm": 0.5865542213725148, "learning_rate": 1.1469421582077733e-05, "loss": 0.0714, "step": 8316 }, { "epoch": 0.6475208019074498, "grad_norm": 0.5960803730733424, "learning_rate": 1.1464906890120514e-05, "loss": 0.0635, "step": 8317 }, { "epoch": 0.6475986569996594, "grad_norm": 0.5238414951537046, "learning_rate": 1.1460392729859482e-05, "loss": 0.0537, "step": 8318 }, { "epoch": 0.647676512091869, "grad_norm": 0.6535446508744787, "learning_rate": 1.1455879101575845e-05, "loss": 0.0607, "step": 8319 }, { "epoch": 0.6477543671840786, "grad_norm": 0.7275092302165334, "learning_rate": 1.1451366005550772e-05, "loss": 0.1024, "step": 8320 }, { "epoch": 0.6478322222762882, "grad_norm": 0.5580461338698095, "learning_rate": 1.144685344206542e-05, "loss": 0.0603, "step": 8321 }, { "epoch": 0.6479100773684979, "grad_norm": 0.6784899247143605, "learning_rate": 1.1442341411400895e-05, "loss": 0.0747, "step": 8322 }, { "epoch": 0.6479879324607075, "grad_norm": 0.517016002040269, "learning_rate": 1.1437829913838275e-05, "loss": 0.0478, "step": 8323 }, { "epoch": 0.6480657875529171, "grad_norm": 0.543706462738846, "learning_rate": 1.1433318949658602e-05, "loss": 0.0586, "step": 8324 }, { "epoch": 0.6481436426451268, "grad_norm": 0.6064631935151432, "learning_rate": 1.1428808519142888e-05, "loss": 0.0685, "step": 8325 }, { "epoch": 0.6482214977373364, "grad_norm": 0.7266367640822475, "learning_rate": 1.1424298622572121e-05, "loss": 0.0772, "step": 8326 }, { "epoch": 0.648299352829546, "grad_norm": 0.6601176188407998, "learning_rate": 1.1419789260227241e-05, "loss": 0.0839, "step": 8327 }, { "epoch": 0.6483772079217557, "grad_norm": 0.5362326831085052, "learning_rate": 1.141528043238916e-05, "loss": 0.0508, "step": 8328 }, { "epoch": 0.6484550630139653, "grad_norm": 0.6219657135258716, "learning_rate": 1.1410772139338757e-05, "loss": 0.0635, "step": 8329 }, { "epoch": 0.6485329181061749, "grad_norm": 0.5883255843757633, "learning_rate": 1.1406264381356875e-05, "loss": 0.0542, "step": 8330 }, { "epoch": 0.6486107731983846, "grad_norm": 0.6581835432474665, "learning_rate": 1.1401757158724327e-05, "loss": 0.0748, "step": 8331 }, { "epoch": 0.6486886282905941, "grad_norm": 0.6164226344569675, "learning_rate": 1.1397250471721895e-05, "loss": 0.0776, "step": 8332 }, { "epoch": 0.6487664833828037, "grad_norm": 0.6149403063963246, "learning_rate": 1.139274432063032e-05, "loss": 0.0632, "step": 8333 }, { "epoch": 0.6488443384750133, "grad_norm": 0.6575614539400583, "learning_rate": 1.138823870573032e-05, "loss": 0.0683, "step": 8334 }, { "epoch": 0.648922193567223, "grad_norm": 0.6090773964547115, "learning_rate": 1.1383733627302558e-05, "loss": 0.0587, "step": 8335 }, { "epoch": 0.6490000486594326, "grad_norm": 0.48901016031495304, "learning_rate": 1.1379229085627701e-05, "loss": 0.0461, "step": 8336 }, { "epoch": 0.6490779037516422, "grad_norm": 0.6101330751512668, "learning_rate": 1.1374725080986351e-05, "loss": 0.0643, "step": 8337 }, { "epoch": 0.6491557588438519, "grad_norm": 0.5480370862580513, "learning_rate": 1.1370221613659083e-05, "loss": 0.0609, "step": 8338 }, { "epoch": 0.6492336139360615, "grad_norm": 0.5513006294167568, "learning_rate": 1.1365718683926448e-05, "loss": 0.0566, "step": 8339 }, { "epoch": 0.6493114690282711, "grad_norm": 0.6392341547310377, "learning_rate": 1.1361216292068954e-05, "loss": 0.0764, "step": 8340 }, { "epoch": 0.6493893241204808, "grad_norm": 0.5845749739578475, "learning_rate": 1.1356714438367078e-05, "loss": 0.0572, "step": 8341 }, { "epoch": 0.6494671792126904, "grad_norm": 0.7231632197134122, "learning_rate": 1.1352213123101265e-05, "loss": 0.0813, "step": 8342 }, { "epoch": 0.6495450343049, "grad_norm": 0.6142919814683276, "learning_rate": 1.1347712346551929e-05, "loss": 0.0608, "step": 8343 }, { "epoch": 0.6496228893971097, "grad_norm": 0.7175945534532959, "learning_rate": 1.1343212108999442e-05, "loss": 0.0895, "step": 8344 }, { "epoch": 0.6497007444893192, "grad_norm": 0.6853093362683967, "learning_rate": 1.1338712410724141e-05, "loss": 0.1027, "step": 8345 }, { "epoch": 0.6497785995815288, "grad_norm": 0.6398125555671224, "learning_rate": 1.1334213252006357e-05, "loss": 0.0941, "step": 8346 }, { "epoch": 0.6498564546737385, "grad_norm": 0.5725993777204521, "learning_rate": 1.1329714633126351e-05, "loss": 0.0454, "step": 8347 }, { "epoch": 0.6499343097659481, "grad_norm": 0.6193546022068335, "learning_rate": 1.1325216554364372e-05, "loss": 0.0685, "step": 8348 }, { "epoch": 0.6500121648581577, "grad_norm": 0.5878379258246329, "learning_rate": 1.1320719016000634e-05, "loss": 0.0567, "step": 8349 }, { "epoch": 0.6500900199503674, "grad_norm": 0.5127091149446035, "learning_rate": 1.1316222018315285e-05, "loss": 0.0504, "step": 8350 }, { "epoch": 0.6500900199503674, "eval_loss": 0.008384584449231625, "eval_runtime": 167.8504, "eval_samples_per_second": 17.158, "eval_steps_per_second": 0.614, "step": 8350 }, { "epoch": 0.650167875042577, "grad_norm": 0.6677166270796937, "learning_rate": 1.1311725561588501e-05, "loss": 0.0902, "step": 8351 }, { "epoch": 0.6502457301347866, "grad_norm": 0.5974993043831076, "learning_rate": 1.1307229646100373e-05, "loss": 0.0718, "step": 8352 }, { "epoch": 0.6503235852269963, "grad_norm": 0.6745954237773502, "learning_rate": 1.130273427213098e-05, "loss": 0.0856, "step": 8353 }, { "epoch": 0.6504014403192059, "grad_norm": 0.6168307008273988, "learning_rate": 1.129823943996036e-05, "loss": 0.078, "step": 8354 }, { "epoch": 0.6504792954114155, "grad_norm": 0.648465343865503, "learning_rate": 1.1293745149868514e-05, "loss": 0.0599, "step": 8355 }, { "epoch": 0.6505571505036252, "grad_norm": 0.679606861831546, "learning_rate": 1.1289251402135429e-05, "loss": 0.0852, "step": 8356 }, { "epoch": 0.6506350055958348, "grad_norm": 0.5300745178660822, "learning_rate": 1.1284758197041037e-05, "loss": 0.0623, "step": 8357 }, { "epoch": 0.6507128606880443, "grad_norm": 0.7024081168363988, "learning_rate": 1.1280265534865253e-05, "loss": 0.1024, "step": 8358 }, { "epoch": 0.650790715780254, "grad_norm": 0.49674280456215586, "learning_rate": 1.127577341588793e-05, "loss": 0.0507, "step": 8359 }, { "epoch": 0.6508685708724636, "grad_norm": 0.5786846864709952, "learning_rate": 1.1271281840388917e-05, "loss": 0.0629, "step": 8360 }, { "epoch": 0.6509464259646732, "grad_norm": 0.5343136700159435, "learning_rate": 1.1266790808648009e-05, "loss": 0.0438, "step": 8361 }, { "epoch": 0.6510242810568829, "grad_norm": 0.5989094911229577, "learning_rate": 1.1262300320944992e-05, "loss": 0.0735, "step": 8362 }, { "epoch": 0.6511021361490925, "grad_norm": 0.5934706344009423, "learning_rate": 1.1257810377559595e-05, "loss": 0.0615, "step": 8363 }, { "epoch": 0.6511799912413021, "grad_norm": 0.5863940021555453, "learning_rate": 1.125332097877152e-05, "loss": 0.0737, "step": 8364 }, { "epoch": 0.6512578463335118, "grad_norm": 0.5196545433674872, "learning_rate": 1.1248832124860431e-05, "loss": 0.0509, "step": 8365 }, { "epoch": 0.6513357014257214, "grad_norm": 0.6515233113401555, "learning_rate": 1.1244343816105963e-05, "loss": 0.0689, "step": 8366 }, { "epoch": 0.651413556517931, "grad_norm": 0.6958881789241171, "learning_rate": 1.1239856052787732e-05, "loss": 0.0936, "step": 8367 }, { "epoch": 0.6514914116101407, "grad_norm": 0.6668322213925453, "learning_rate": 1.1235368835185287e-05, "loss": 0.089, "step": 8368 }, { "epoch": 0.6515692667023503, "grad_norm": 0.6111107792603285, "learning_rate": 1.1230882163578168e-05, "loss": 0.0634, "step": 8369 }, { "epoch": 0.6516471217945599, "grad_norm": 0.4956310030067201, "learning_rate": 1.1226396038245867e-05, "loss": 0.0419, "step": 8370 }, { "epoch": 0.6517249768867694, "grad_norm": 0.5378930540387761, "learning_rate": 1.1221910459467851e-05, "loss": 0.0624, "step": 8371 }, { "epoch": 0.6518028319789791, "grad_norm": 0.6210764074819324, "learning_rate": 1.1217425427523558e-05, "loss": 0.0691, "step": 8372 }, { "epoch": 0.6518806870711887, "grad_norm": 0.5834582192569991, "learning_rate": 1.1212940942692376e-05, "loss": 0.0647, "step": 8373 }, { "epoch": 0.6519585421633983, "grad_norm": 0.5627695307346177, "learning_rate": 1.1208457005253675e-05, "loss": 0.0557, "step": 8374 }, { "epoch": 0.652036397255608, "grad_norm": 0.5568752962257366, "learning_rate": 1.1203973615486777e-05, "loss": 0.0543, "step": 8375 }, { "epoch": 0.6521142523478176, "grad_norm": 0.5557601566469973, "learning_rate": 1.1199490773670978e-05, "loss": 0.0486, "step": 8376 }, { "epoch": 0.6521921074400272, "grad_norm": 0.7940251896710196, "learning_rate": 1.1195008480085536e-05, "loss": 0.0953, "step": 8377 }, { "epoch": 0.6522699625322369, "grad_norm": 0.4767112843106506, "learning_rate": 1.1190526735009679e-05, "loss": 0.0373, "step": 8378 }, { "epoch": 0.6523478176244465, "grad_norm": 0.597520478297421, "learning_rate": 1.1186045538722601e-05, "loss": 0.0636, "step": 8379 }, { "epoch": 0.6524256727166561, "grad_norm": 0.542072816926786, "learning_rate": 1.1181564891503456e-05, "loss": 0.0567, "step": 8380 }, { "epoch": 0.6525035278088658, "grad_norm": 0.5484469467409409, "learning_rate": 1.1177084793631358e-05, "loss": 0.0632, "step": 8381 }, { "epoch": 0.6525813829010754, "grad_norm": 0.5845373445112796, "learning_rate": 1.1172605245385415e-05, "loss": 0.0643, "step": 8382 }, { "epoch": 0.652659237993285, "grad_norm": 0.6781787647309818, "learning_rate": 1.1168126247044672e-05, "loss": 0.0976, "step": 8383 }, { "epoch": 0.6527370930854947, "grad_norm": 0.6612522683230376, "learning_rate": 1.1163647798888151e-05, "loss": 0.0774, "step": 8384 }, { "epoch": 0.6528149481777042, "grad_norm": 0.574539091472847, "learning_rate": 1.1159169901194838e-05, "loss": 0.0628, "step": 8385 }, { "epoch": 0.6528928032699138, "grad_norm": 0.5614999126878779, "learning_rate": 1.1154692554243685e-05, "loss": 0.0546, "step": 8386 }, { "epoch": 0.6529706583621235, "grad_norm": 0.5586917739941243, "learning_rate": 1.1150215758313607e-05, "loss": 0.0475, "step": 8387 }, { "epoch": 0.6530485134543331, "grad_norm": 0.5652897891798986, "learning_rate": 1.114573951368349e-05, "loss": 0.0529, "step": 8388 }, { "epoch": 0.6531263685465427, "grad_norm": 0.6006768143034645, "learning_rate": 1.1141263820632184e-05, "loss": 0.0766, "step": 8389 }, { "epoch": 0.6532042236387524, "grad_norm": 0.5513304907673666, "learning_rate": 1.1136788679438504e-05, "loss": 0.0566, "step": 8390 }, { "epoch": 0.653282078730962, "grad_norm": 0.6830594329672822, "learning_rate": 1.1132314090381216e-05, "loss": 0.0644, "step": 8391 }, { "epoch": 0.6533599338231716, "grad_norm": 0.5595052654018837, "learning_rate": 1.112784005373909e-05, "loss": 0.0611, "step": 8392 }, { "epoch": 0.6534377889153813, "grad_norm": 0.5357617359349689, "learning_rate": 1.1123366569790823e-05, "loss": 0.0509, "step": 8393 }, { "epoch": 0.6535156440075909, "grad_norm": 0.5329318158870769, "learning_rate": 1.1118893638815095e-05, "loss": 0.062, "step": 8394 }, { "epoch": 0.6535934990998005, "grad_norm": 0.6020720176604994, "learning_rate": 1.1114421261090551e-05, "loss": 0.0662, "step": 8395 }, { "epoch": 0.6536713541920102, "grad_norm": 0.5006471319581751, "learning_rate": 1.1109949436895797e-05, "loss": 0.052, "step": 8396 }, { "epoch": 0.6537492092842198, "grad_norm": 0.6311442919323764, "learning_rate": 1.1105478166509402e-05, "loss": 0.0792, "step": 8397 }, { "epoch": 0.6538270643764293, "grad_norm": 0.5073478362775586, "learning_rate": 1.1101007450209912e-05, "loss": 0.047, "step": 8398 }, { "epoch": 0.653904919468639, "grad_norm": 0.722604315189617, "learning_rate": 1.109653728827583e-05, "loss": 0.108, "step": 8399 }, { "epoch": 0.6539827745608486, "grad_norm": 0.7100132019829513, "learning_rate": 1.1092067680985626e-05, "loss": 0.0796, "step": 8400 }, { "epoch": 0.6539827745608486, "eval_loss": 0.00826586876064539, "eval_runtime": 162.6596, "eval_samples_per_second": 17.706, "eval_steps_per_second": 0.633, "step": 8400 }, { "epoch": 0.6540606296530582, "grad_norm": 0.5355637184452684, "learning_rate": 1.1087598628617728e-05, "loss": 0.0507, "step": 8401 }, { "epoch": 0.6541384847452679, "grad_norm": 0.5916903295356128, "learning_rate": 1.1083130131450541e-05, "loss": 0.0683, "step": 8402 }, { "epoch": 0.6542163398374775, "grad_norm": 0.5837118711875133, "learning_rate": 1.1078662189762442e-05, "loss": 0.0634, "step": 8403 }, { "epoch": 0.6542941949296871, "grad_norm": 0.5297447100704326, "learning_rate": 1.1074194803831753e-05, "loss": 0.0577, "step": 8404 }, { "epoch": 0.6543720500218967, "grad_norm": 0.5979444336163994, "learning_rate": 1.1069727973936773e-05, "loss": 0.0726, "step": 8405 }, { "epoch": 0.6544499051141064, "grad_norm": 0.6494537374287795, "learning_rate": 1.1065261700355762e-05, "loss": 0.0748, "step": 8406 }, { "epoch": 0.654527760206316, "grad_norm": 0.5865371772269904, "learning_rate": 1.1060795983366953e-05, "loss": 0.0443, "step": 8407 }, { "epoch": 0.6546056152985256, "grad_norm": 0.5791744842356407, "learning_rate": 1.1056330823248533e-05, "loss": 0.0609, "step": 8408 }, { "epoch": 0.6546834703907353, "grad_norm": 0.5509332106592134, "learning_rate": 1.1051866220278669e-05, "loss": 0.0586, "step": 8409 }, { "epoch": 0.6547613254829449, "grad_norm": 0.5533397968183841, "learning_rate": 1.1047402174735476e-05, "loss": 0.0536, "step": 8410 }, { "epoch": 0.6548391805751544, "grad_norm": 0.6281090005920791, "learning_rate": 1.1042938686897047e-05, "loss": 0.0852, "step": 8411 }, { "epoch": 0.6549170356673641, "grad_norm": 0.6557357936651018, "learning_rate": 1.1038475757041427e-05, "loss": 0.0798, "step": 8412 }, { "epoch": 0.6549948907595737, "grad_norm": 0.5712004221925043, "learning_rate": 1.1034013385446654e-05, "loss": 0.0608, "step": 8413 }, { "epoch": 0.6550727458517833, "grad_norm": 0.6293497178572413, "learning_rate": 1.1029551572390704e-05, "loss": 0.0805, "step": 8414 }, { "epoch": 0.655150600943993, "grad_norm": 0.6942147029681921, "learning_rate": 1.1025090318151523e-05, "loss": 0.0879, "step": 8415 }, { "epoch": 0.6552284560362026, "grad_norm": 0.6076969069239893, "learning_rate": 1.102062962300704e-05, "loss": 0.0749, "step": 8416 }, { "epoch": 0.6553063111284122, "grad_norm": 0.6587712821372536, "learning_rate": 1.1016169487235108e-05, "loss": 0.0694, "step": 8417 }, { "epoch": 0.6553841662206219, "grad_norm": 0.5766817710079681, "learning_rate": 1.1011709911113596e-05, "loss": 0.0565, "step": 8418 }, { "epoch": 0.6554620213128315, "grad_norm": 0.7161403525575413, "learning_rate": 1.100725089492031e-05, "loss": 0.0972, "step": 8419 }, { "epoch": 0.6555398764050411, "grad_norm": 0.6026207812566792, "learning_rate": 1.1002792438933023e-05, "loss": 0.0614, "step": 8420 }, { "epoch": 0.6556177314972508, "grad_norm": 0.5805250530160405, "learning_rate": 1.0998334543429477e-05, "loss": 0.0695, "step": 8421 }, { "epoch": 0.6556955865894604, "grad_norm": 0.5771578199797601, "learning_rate": 1.0993877208687368e-05, "loss": 0.0603, "step": 8422 }, { "epoch": 0.65577344168167, "grad_norm": 0.6634600832497385, "learning_rate": 1.0989420434984383e-05, "loss": 0.0797, "step": 8423 }, { "epoch": 0.6558512967738797, "grad_norm": 0.5751399498504941, "learning_rate": 1.0984964222598155e-05, "loss": 0.0571, "step": 8424 }, { "epoch": 0.6559291518660892, "grad_norm": 0.4548691627983054, "learning_rate": 1.0980508571806287e-05, "loss": 0.0368, "step": 8425 }, { "epoch": 0.6560070069582988, "grad_norm": 0.6246606415861254, "learning_rate": 1.0976053482886332e-05, "loss": 0.0687, "step": 8426 }, { "epoch": 0.6560848620505085, "grad_norm": 0.6407080573128393, "learning_rate": 1.0971598956115823e-05, "loss": 0.0879, "step": 8427 }, { "epoch": 0.6561627171427181, "grad_norm": 0.5195674883657575, "learning_rate": 1.0967144991772268e-05, "loss": 0.0532, "step": 8428 }, { "epoch": 0.6562405722349277, "grad_norm": 0.6157988017557176, "learning_rate": 1.096269159013312e-05, "loss": 0.092, "step": 8429 }, { "epoch": 0.6563184273271374, "grad_norm": 0.5478409932566486, "learning_rate": 1.095823875147581e-05, "loss": 0.0586, "step": 8430 }, { "epoch": 0.656396282419347, "grad_norm": 0.6162763465765093, "learning_rate": 1.0953786476077726e-05, "loss": 0.0694, "step": 8431 }, { "epoch": 0.6564741375115566, "grad_norm": 0.5267745800038587, "learning_rate": 1.094933476421621e-05, "loss": 0.0504, "step": 8432 }, { "epoch": 0.6565519926037663, "grad_norm": 0.5653730117424677, "learning_rate": 1.0944883616168611e-05, "loss": 0.0569, "step": 8433 }, { "epoch": 0.6566298476959759, "grad_norm": 0.5862961382597724, "learning_rate": 1.0940433032212204e-05, "loss": 0.0542, "step": 8434 }, { "epoch": 0.6567077027881855, "grad_norm": 0.5234319516434833, "learning_rate": 1.093598301262423e-05, "loss": 0.0517, "step": 8435 }, { "epoch": 0.6567855578803952, "grad_norm": 0.6136911182978709, "learning_rate": 1.0931533557681906e-05, "loss": 0.0592, "step": 8436 }, { "epoch": 0.6568634129726048, "grad_norm": 0.5958176028251365, "learning_rate": 1.092708466766242e-05, "loss": 0.0527, "step": 8437 }, { "epoch": 0.6569412680648143, "grad_norm": 0.5580127229755368, "learning_rate": 1.0922636342842902e-05, "loss": 0.0635, "step": 8438 }, { "epoch": 0.657019123157024, "grad_norm": 0.47775086888382834, "learning_rate": 1.0918188583500483e-05, "loss": 0.0428, "step": 8439 }, { "epoch": 0.6570969782492336, "grad_norm": 0.5112139385018202, "learning_rate": 1.0913741389912223e-05, "loss": 0.0463, "step": 8440 }, { "epoch": 0.6571748333414432, "grad_norm": 0.6179436107614393, "learning_rate": 1.090929476235517e-05, "loss": 0.0844, "step": 8441 }, { "epoch": 0.6572526884336528, "grad_norm": 0.5189796931926739, "learning_rate": 1.0904848701106318e-05, "loss": 0.0483, "step": 8442 }, { "epoch": 0.6573305435258625, "grad_norm": 0.6061419780416886, "learning_rate": 1.0900403206442644e-05, "loss": 0.0765, "step": 8443 }, { "epoch": 0.6574083986180721, "grad_norm": 0.5136316987733985, "learning_rate": 1.0895958278641077e-05, "loss": 0.0611, "step": 8444 }, { "epoch": 0.6574862537102817, "grad_norm": 0.5556947131513257, "learning_rate": 1.0891513917978517e-05, "loss": 0.0637, "step": 8445 }, { "epoch": 0.6575641088024914, "grad_norm": 0.5521036704004149, "learning_rate": 1.088707012473183e-05, "loss": 0.0467, "step": 8446 }, { "epoch": 0.657641963894701, "grad_norm": 0.5789666742918368, "learning_rate": 1.0882626899177835e-05, "loss": 0.064, "step": 8447 }, { "epoch": 0.6577198189869106, "grad_norm": 0.54303323927964, "learning_rate": 1.0878184241593322e-05, "loss": 0.0595, "step": 8448 }, { "epoch": 0.6577976740791203, "grad_norm": 0.5927600061793181, "learning_rate": 1.0873742152255062e-05, "loss": 0.0636, "step": 8449 }, { "epoch": 0.6578755291713299, "grad_norm": 0.6162653506719934, "learning_rate": 1.086930063143977e-05, "loss": 0.0607, "step": 8450 }, { "epoch": 0.6578755291713299, "eval_loss": 0.008185779675841331, "eval_runtime": 162.1882, "eval_samples_per_second": 17.757, "eval_steps_per_second": 0.635, "step": 8450 }, { "epoch": 0.6579533842635394, "grad_norm": 0.7664201145577104, "learning_rate": 1.0864859679424129e-05, "loss": 0.1024, "step": 8451 }, { "epoch": 0.6580312393557491, "grad_norm": 0.645909860084504, "learning_rate": 1.0860419296484792e-05, "loss": 0.1107, "step": 8452 }, { "epoch": 0.6581090944479587, "grad_norm": 0.5939616825585994, "learning_rate": 1.0855979482898374e-05, "loss": 0.0756, "step": 8453 }, { "epoch": 0.6581869495401683, "grad_norm": 0.5303210846334077, "learning_rate": 1.085154023894145e-05, "loss": 0.0395, "step": 8454 }, { "epoch": 0.658264804632378, "grad_norm": 0.6741618389369659, "learning_rate": 1.0847101564890571e-05, "loss": 0.0859, "step": 8455 }, { "epoch": 0.6583426597245876, "grad_norm": 0.5882834270204145, "learning_rate": 1.0842663461022243e-05, "loss": 0.0585, "step": 8456 }, { "epoch": 0.6584205148167972, "grad_norm": 0.5910995618698568, "learning_rate": 1.0838225927612935e-05, "loss": 0.0544, "step": 8457 }, { "epoch": 0.6584983699090069, "grad_norm": 0.5998034051989023, "learning_rate": 1.0833788964939078e-05, "loss": 0.0607, "step": 8458 }, { "epoch": 0.6585762250012165, "grad_norm": 0.5988677816247332, "learning_rate": 1.0829352573277095e-05, "loss": 0.0594, "step": 8459 }, { "epoch": 0.6586540800934261, "grad_norm": 0.47639132073489393, "learning_rate": 1.082491675290334e-05, "loss": 0.036, "step": 8460 }, { "epoch": 0.6587319351856358, "grad_norm": 0.5385917036165742, "learning_rate": 1.0820481504094148e-05, "loss": 0.0722, "step": 8461 }, { "epoch": 0.6588097902778454, "grad_norm": 0.5824181844313653, "learning_rate": 1.0816046827125803e-05, "loss": 0.0636, "step": 8462 }, { "epoch": 0.658887645370055, "grad_norm": 0.5082138533738656, "learning_rate": 1.0811612722274579e-05, "loss": 0.0464, "step": 8463 }, { "epoch": 0.6589655004622647, "grad_norm": 0.5734981091237659, "learning_rate": 1.080717918981669e-05, "loss": 0.0735, "step": 8464 }, { "epoch": 0.6590433555544742, "grad_norm": 0.5343577197341719, "learning_rate": 1.0802746230028327e-05, "loss": 0.0643, "step": 8465 }, { "epoch": 0.6591212106466838, "grad_norm": 0.5510841859078872, "learning_rate": 1.0798313843185646e-05, "loss": 0.0491, "step": 8466 }, { "epoch": 0.6591990657388935, "grad_norm": 0.6541756456078047, "learning_rate": 1.079388202956476e-05, "loss": 0.0679, "step": 8467 }, { "epoch": 0.6592769208311031, "grad_norm": 0.6246960312651716, "learning_rate": 1.078945078944174e-05, "loss": 0.0639, "step": 8468 }, { "epoch": 0.6593547759233127, "grad_norm": 0.5367590386675744, "learning_rate": 1.0785020123092651e-05, "loss": 0.0443, "step": 8469 }, { "epoch": 0.6594326310155224, "grad_norm": 0.5922639552355454, "learning_rate": 1.0780590030793499e-05, "loss": 0.0608, "step": 8470 }, { "epoch": 0.659510486107732, "grad_norm": 0.6399948157121489, "learning_rate": 1.0776160512820249e-05, "loss": 0.0675, "step": 8471 }, { "epoch": 0.6595883411999416, "grad_norm": 0.5779152558644981, "learning_rate": 1.0771731569448843e-05, "loss": 0.0529, "step": 8472 }, { "epoch": 0.6596661962921513, "grad_norm": 0.607013182099162, "learning_rate": 1.0767303200955183e-05, "loss": 0.0713, "step": 8473 }, { "epoch": 0.6597440513843609, "grad_norm": 0.5387742994403995, "learning_rate": 1.0762875407615136e-05, "loss": 0.0486, "step": 8474 }, { "epoch": 0.6598219064765705, "grad_norm": 0.5268587314472335, "learning_rate": 1.0758448189704536e-05, "loss": 0.0505, "step": 8475 }, { "epoch": 0.6598997615687802, "grad_norm": 0.5568758753792659, "learning_rate": 1.0754021547499171e-05, "loss": 0.0532, "step": 8476 }, { "epoch": 0.6599776166609898, "grad_norm": 0.5109131377915697, "learning_rate": 1.0749595481274806e-05, "loss": 0.0527, "step": 8477 }, { "epoch": 0.6600554717531993, "grad_norm": 0.4994067257582289, "learning_rate": 1.074516999130716e-05, "loss": 0.0388, "step": 8478 }, { "epoch": 0.6601333268454089, "grad_norm": 0.5601230306157485, "learning_rate": 1.0740745077871917e-05, "loss": 0.0692, "step": 8479 }, { "epoch": 0.6602111819376186, "grad_norm": 0.7369650661789952, "learning_rate": 1.073632074124474e-05, "loss": 0.0855, "step": 8480 }, { "epoch": 0.6602890370298282, "grad_norm": 0.5600260535696343, "learning_rate": 1.0731896981701237e-05, "loss": 0.0543, "step": 8481 }, { "epoch": 0.6603668921220378, "grad_norm": 0.5072735804801556, "learning_rate": 1.0727473799516987e-05, "loss": 0.0568, "step": 8482 }, { "epoch": 0.6604447472142475, "grad_norm": 0.49600187071159635, "learning_rate": 1.0723051194967543e-05, "loss": 0.0479, "step": 8483 }, { "epoch": 0.6605226023064571, "grad_norm": 0.5924481799919777, "learning_rate": 1.071862916832839e-05, "loss": 0.0722, "step": 8484 }, { "epoch": 0.6606004573986667, "grad_norm": 0.6279920792048382, "learning_rate": 1.071420771987502e-05, "loss": 0.0697, "step": 8485 }, { "epoch": 0.6606783124908764, "grad_norm": 0.4440271642303403, "learning_rate": 1.0709786849882865e-05, "loss": 0.0378, "step": 8486 }, { "epoch": 0.660756167583086, "grad_norm": 0.6159999907759796, "learning_rate": 1.0705366558627315e-05, "loss": 0.0588, "step": 8487 }, { "epoch": 0.6608340226752956, "grad_norm": 0.5065913036578109, "learning_rate": 1.0700946846383744e-05, "loss": 0.0676, "step": 8488 }, { "epoch": 0.6609118777675053, "grad_norm": 0.47922337945636234, "learning_rate": 1.0696527713427466e-05, "loss": 0.0453, "step": 8489 }, { "epoch": 0.6609897328597149, "grad_norm": 0.6256722353208084, "learning_rate": 1.069210916003379e-05, "loss": 0.0599, "step": 8490 }, { "epoch": 0.6610675879519244, "grad_norm": 0.5419279331629115, "learning_rate": 1.0687691186477962e-05, "loss": 0.0583, "step": 8491 }, { "epoch": 0.6611454430441341, "grad_norm": 0.48563819735938824, "learning_rate": 1.0683273793035211e-05, "loss": 0.0355, "step": 8492 }, { "epoch": 0.6612232981363437, "grad_norm": 0.6984162290845378, "learning_rate": 1.0678856979980701e-05, "loss": 0.0964, "step": 8493 }, { "epoch": 0.6613011532285533, "grad_norm": 0.5516266015996543, "learning_rate": 1.0674440747589578e-05, "loss": 0.0539, "step": 8494 }, { "epoch": 0.661379008320763, "grad_norm": 0.4856889680619007, "learning_rate": 1.0670025096136974e-05, "loss": 0.043, "step": 8495 }, { "epoch": 0.6614568634129726, "grad_norm": 0.5331688116729892, "learning_rate": 1.0665610025897948e-05, "loss": 0.047, "step": 8496 }, { "epoch": 0.6615347185051822, "grad_norm": 0.5768699557668068, "learning_rate": 1.0661195537147546e-05, "loss": 0.0594, "step": 8497 }, { "epoch": 0.6616125735973919, "grad_norm": 0.5669598364663934, "learning_rate": 1.0656781630160762e-05, "loss": 0.0498, "step": 8498 }, { "epoch": 0.6616904286896015, "grad_norm": 0.6849439088006443, "learning_rate": 1.065236830521256e-05, "loss": 0.0819, "step": 8499 }, { "epoch": 0.6617682837818111, "grad_norm": 0.5406655376017121, "learning_rate": 1.0647955562577881e-05, "loss": 0.0472, "step": 8500 }, { "epoch": 0.6617682837818111, "eval_loss": 0.008035731501877308, "eval_runtime": 162.155, "eval_samples_per_second": 17.761, "eval_steps_per_second": 0.635, "step": 8500 }, { "epoch": 0.6618461388740208, "grad_norm": 0.6313815460193263, "learning_rate": 1.064354340253162e-05, "loss": 0.0889, "step": 8501 }, { "epoch": 0.6619239939662304, "grad_norm": 0.5715252557698248, "learning_rate": 1.0639131825348618e-05, "loss": 0.0467, "step": 8502 }, { "epoch": 0.66200184905844, "grad_norm": 0.5417292329083316, "learning_rate": 1.0634720831303704e-05, "loss": 0.0673, "step": 8503 }, { "epoch": 0.6620797041506497, "grad_norm": 0.4860151483558845, "learning_rate": 1.0630310420671654e-05, "loss": 0.0503, "step": 8504 }, { "epoch": 0.6621575592428592, "grad_norm": 0.5907376579839315, "learning_rate": 1.062590059372723e-05, "loss": 0.0585, "step": 8505 }, { "epoch": 0.6622354143350688, "grad_norm": 0.6807322083302769, "learning_rate": 1.0621491350745137e-05, "loss": 0.0806, "step": 8506 }, { "epoch": 0.6623132694272785, "grad_norm": 0.5280194402101535, "learning_rate": 1.061708269200005e-05, "loss": 0.0395, "step": 8507 }, { "epoch": 0.6623911245194881, "grad_norm": 0.5935869676559011, "learning_rate": 1.061267461776661e-05, "loss": 0.0885, "step": 8508 }, { "epoch": 0.6624689796116977, "grad_norm": 0.6238594640306652, "learning_rate": 1.0608267128319408e-05, "loss": 0.0693, "step": 8509 }, { "epoch": 0.6625468347039074, "grad_norm": 0.6673667047622766, "learning_rate": 1.0603860223933034e-05, "loss": 0.0786, "step": 8510 }, { "epoch": 0.662624689796117, "grad_norm": 0.6066247810618157, "learning_rate": 1.0599453904881993e-05, "loss": 0.064, "step": 8511 }, { "epoch": 0.6627025448883266, "grad_norm": 0.5446882900084412, "learning_rate": 1.0595048171440789e-05, "loss": 0.0495, "step": 8512 }, { "epoch": 0.6627803999805362, "grad_norm": 0.6086045877370174, "learning_rate": 1.0590643023883875e-05, "loss": 0.0628, "step": 8513 }, { "epoch": 0.6628582550727459, "grad_norm": 0.6131756907367945, "learning_rate": 1.0586238462485676e-05, "loss": 0.0864, "step": 8514 }, { "epoch": 0.6629361101649555, "grad_norm": 0.5906600820740023, "learning_rate": 1.0581834487520562e-05, "loss": 0.0602, "step": 8515 }, { "epoch": 0.663013965257165, "grad_norm": 0.5009025609096168, "learning_rate": 1.0577431099262898e-05, "loss": 0.0458, "step": 8516 }, { "epoch": 0.6630918203493747, "grad_norm": 0.48962493283401237, "learning_rate": 1.0573028297986987e-05, "loss": 0.0491, "step": 8517 }, { "epoch": 0.6631696754415843, "grad_norm": 0.5865120581552495, "learning_rate": 1.0568626083967101e-05, "loss": 0.0707, "step": 8518 }, { "epoch": 0.6632475305337939, "grad_norm": 0.6420346096425948, "learning_rate": 1.056422445747748e-05, "loss": 0.0664, "step": 8519 }, { "epoch": 0.6633253856260036, "grad_norm": 0.5526254315973461, "learning_rate": 1.0559823418792323e-05, "loss": 0.0408, "step": 8520 }, { "epoch": 0.6634032407182132, "grad_norm": 0.5763076583200496, "learning_rate": 1.0555422968185795e-05, "loss": 0.0639, "step": 8521 }, { "epoch": 0.6634810958104228, "grad_norm": 0.7086931954472598, "learning_rate": 1.0551023105932021e-05, "loss": 0.0976, "step": 8522 }, { "epoch": 0.6635589509026325, "grad_norm": 0.6340829707961203, "learning_rate": 1.0546623832305097e-05, "loss": 0.0879, "step": 8523 }, { "epoch": 0.6636368059948421, "grad_norm": 0.5497224075259216, "learning_rate": 1.0542225147579071e-05, "loss": 0.0623, "step": 8524 }, { "epoch": 0.6637146610870517, "grad_norm": 0.45548724020218534, "learning_rate": 1.0537827052027953e-05, "loss": 0.0479, "step": 8525 }, { "epoch": 0.6637925161792614, "grad_norm": 0.6425560208896431, "learning_rate": 1.0533429545925744e-05, "loss": 0.0837, "step": 8526 }, { "epoch": 0.663870371271471, "grad_norm": 0.5818574949691705, "learning_rate": 1.0529032629546377e-05, "loss": 0.0662, "step": 8527 }, { "epoch": 0.6639482263636806, "grad_norm": 0.6933340748200015, "learning_rate": 1.052463630316376e-05, "loss": 0.1099, "step": 8528 }, { "epoch": 0.6640260814558903, "grad_norm": 0.49142484805449976, "learning_rate": 1.052024056705176e-05, "loss": 0.0375, "step": 8529 }, { "epoch": 0.6641039365480998, "grad_norm": 0.4725293702883209, "learning_rate": 1.0515845421484218e-05, "loss": 0.05, "step": 8530 }, { "epoch": 0.6641817916403094, "grad_norm": 0.6229929803407279, "learning_rate": 1.0511450866734924e-05, "loss": 0.089, "step": 8531 }, { "epoch": 0.6642596467325191, "grad_norm": 0.6489190022447431, "learning_rate": 1.0507056903077644e-05, "loss": 0.0628, "step": 8532 }, { "epoch": 0.6643375018247287, "grad_norm": 0.5220635766734522, "learning_rate": 1.0502663530786097e-05, "loss": 0.061, "step": 8533 }, { "epoch": 0.6644153569169383, "grad_norm": 0.526307049112491, "learning_rate": 1.049827075013397e-05, "loss": 0.0473, "step": 8534 }, { "epoch": 0.664493212009148, "grad_norm": 0.7132375921160525, "learning_rate": 1.0493878561394904e-05, "loss": 0.1106, "step": 8535 }, { "epoch": 0.6645710671013576, "grad_norm": 0.5977138656927202, "learning_rate": 1.048948696484253e-05, "loss": 0.0805, "step": 8536 }, { "epoch": 0.6646489221935672, "grad_norm": 0.5841095783421979, "learning_rate": 1.0485095960750416e-05, "loss": 0.0953, "step": 8537 }, { "epoch": 0.6647267772857769, "grad_norm": 0.5824339279040593, "learning_rate": 1.0480705549392097e-05, "loss": 0.0534, "step": 8538 }, { "epoch": 0.6648046323779865, "grad_norm": 0.6025740387613564, "learning_rate": 1.0476315731041078e-05, "loss": 0.0605, "step": 8539 }, { "epoch": 0.6648824874701961, "grad_norm": 0.49908967199280924, "learning_rate": 1.0471926505970825e-05, "loss": 0.0511, "step": 8540 }, { "epoch": 0.6649603425624058, "grad_norm": 0.48949137960369254, "learning_rate": 1.0467537874454765e-05, "loss": 0.0443, "step": 8541 }, { "epoch": 0.6650381976546154, "grad_norm": 0.5558679096251894, "learning_rate": 1.0463149836766288e-05, "loss": 0.0514, "step": 8542 }, { "epoch": 0.665116052746825, "grad_norm": 0.641680180244653, "learning_rate": 1.045876239317875e-05, "loss": 0.0894, "step": 8543 }, { "epoch": 0.6651939078390346, "grad_norm": 0.5224211056971901, "learning_rate": 1.0454375543965468e-05, "loss": 0.0686, "step": 8544 }, { "epoch": 0.6652717629312442, "grad_norm": 0.5073699726284593, "learning_rate": 1.044998928939971e-05, "loss": 0.0488, "step": 8545 }, { "epoch": 0.6653496180234538, "grad_norm": 0.691721475172324, "learning_rate": 1.044560362975474e-05, "loss": 0.0997, "step": 8546 }, { "epoch": 0.6654274731156635, "grad_norm": 0.5282988232656411, "learning_rate": 1.0441218565303752e-05, "loss": 0.0669, "step": 8547 }, { "epoch": 0.6655053282078731, "grad_norm": 0.5975431661782099, "learning_rate": 1.043683409631992e-05, "loss": 0.0609, "step": 8548 }, { "epoch": 0.6655831833000827, "grad_norm": 0.6156021642617895, "learning_rate": 1.0432450223076383e-05, "loss": 0.0694, "step": 8549 }, { "epoch": 0.6656610383922923, "grad_norm": 0.6212268775862517, "learning_rate": 1.0428066945846206e-05, "loss": 0.0647, "step": 8550 }, { "epoch": 0.6656610383922923, "eval_loss": 0.008025011979043484, "eval_runtime": 162.3313, "eval_samples_per_second": 17.741, "eval_steps_per_second": 0.635, "step": 8550 }, { "epoch": 0.665738893484502, "grad_norm": 0.5580241148865186, "learning_rate": 1.0423684264902474e-05, "loss": 0.0677, "step": 8551 }, { "epoch": 0.6658167485767116, "grad_norm": 0.6459923330063796, "learning_rate": 1.04193021805182e-05, "loss": 0.0807, "step": 8552 }, { "epoch": 0.6658946036689212, "grad_norm": 0.563714331942701, "learning_rate": 1.0414920692966369e-05, "loss": 0.0472, "step": 8553 }, { "epoch": 0.6659724587611309, "grad_norm": 0.4624827015381957, "learning_rate": 1.0410539802519924e-05, "loss": 0.0423, "step": 8554 }, { "epoch": 0.6660503138533405, "grad_norm": 0.5346748622885147, "learning_rate": 1.0406159509451774e-05, "loss": 0.0488, "step": 8555 }, { "epoch": 0.66612816894555, "grad_norm": 0.5487904884803089, "learning_rate": 1.0401779814034785e-05, "loss": 0.0469, "step": 8556 }, { "epoch": 0.6662060240377597, "grad_norm": 0.6435321515420387, "learning_rate": 1.0397400716541806e-05, "loss": 0.0902, "step": 8557 }, { "epoch": 0.6662838791299693, "grad_norm": 0.5963801240970087, "learning_rate": 1.039302221724563e-05, "loss": 0.0764, "step": 8558 }, { "epoch": 0.6663617342221789, "grad_norm": 0.48024575739208103, "learning_rate": 1.038864431641901e-05, "loss": 0.0401, "step": 8559 }, { "epoch": 0.6664395893143886, "grad_norm": 0.6355116248187761, "learning_rate": 1.0384267014334668e-05, "loss": 0.083, "step": 8560 }, { "epoch": 0.6665174444065982, "grad_norm": 0.5655365311889672, "learning_rate": 1.0379890311265289e-05, "loss": 0.0664, "step": 8561 }, { "epoch": 0.6665952994988078, "grad_norm": 0.49294266602923775, "learning_rate": 1.037551420748353e-05, "loss": 0.0499, "step": 8562 }, { "epoch": 0.6666731545910175, "grad_norm": 0.49858104532635544, "learning_rate": 1.0371138703261998e-05, "loss": 0.0595, "step": 8563 }, { "epoch": 0.6667510096832271, "grad_norm": 0.5736841049943864, "learning_rate": 1.0366763798873265e-05, "loss": 0.0661, "step": 8564 }, { "epoch": 0.6668288647754367, "grad_norm": 0.5281909814300838, "learning_rate": 1.0362389494589866e-05, "loss": 0.0404, "step": 8565 }, { "epoch": 0.6669067198676464, "grad_norm": 0.6065063139698502, "learning_rate": 1.0358015790684295e-05, "loss": 0.0843, "step": 8566 }, { "epoch": 0.666984574959856, "grad_norm": 0.6109471495590498, "learning_rate": 1.0353642687429035e-05, "loss": 0.0629, "step": 8567 }, { "epoch": 0.6670624300520656, "grad_norm": 0.6979849694831305, "learning_rate": 1.0349270185096483e-05, "loss": 0.0927, "step": 8568 }, { "epoch": 0.6671402851442753, "grad_norm": 0.5302045259951603, "learning_rate": 1.0344898283959037e-05, "loss": 0.0623, "step": 8569 }, { "epoch": 0.6672181402364848, "grad_norm": 0.7149346810625312, "learning_rate": 1.0340526984289046e-05, "loss": 0.089, "step": 8570 }, { "epoch": 0.6672959953286944, "grad_norm": 0.5027672436016649, "learning_rate": 1.0336156286358814e-05, "loss": 0.0534, "step": 8571 }, { "epoch": 0.6673738504209041, "grad_norm": 0.6127598805645162, "learning_rate": 1.0331786190440627e-05, "loss": 0.0741, "step": 8572 }, { "epoch": 0.6674517055131137, "grad_norm": 0.574048440465166, "learning_rate": 1.0327416696806715e-05, "loss": 0.0731, "step": 8573 }, { "epoch": 0.6675295606053233, "grad_norm": 0.5543534014928647, "learning_rate": 1.0323047805729279e-05, "loss": 0.0647, "step": 8574 }, { "epoch": 0.667607415697533, "grad_norm": 0.6897075412817106, "learning_rate": 1.0318679517480477e-05, "loss": 0.0842, "step": 8575 }, { "epoch": 0.6676852707897426, "grad_norm": 0.5848680686762273, "learning_rate": 1.0314311832332437e-05, "loss": 0.0668, "step": 8576 }, { "epoch": 0.6677631258819522, "grad_norm": 0.5714627338160025, "learning_rate": 1.0309944750557244e-05, "loss": 0.0636, "step": 8577 }, { "epoch": 0.6678409809741619, "grad_norm": 0.556570308628972, "learning_rate": 1.0305578272426944e-05, "loss": 0.0753, "step": 8578 }, { "epoch": 0.6679188360663715, "grad_norm": 0.5288047275763588, "learning_rate": 1.0301212398213546e-05, "loss": 0.0587, "step": 8579 }, { "epoch": 0.6679966911585811, "grad_norm": 0.4642842773532599, "learning_rate": 1.0296847128189034e-05, "loss": 0.0596, "step": 8580 }, { "epoch": 0.6680745462507908, "grad_norm": 0.6505748692995776, "learning_rate": 1.0292482462625326e-05, "loss": 0.0756, "step": 8581 }, { "epoch": 0.6681524013430004, "grad_norm": 0.47614564352028854, "learning_rate": 1.0288118401794337e-05, "loss": 0.0382, "step": 8582 }, { "epoch": 0.66823025643521, "grad_norm": 0.5277515092482846, "learning_rate": 1.0283754945967926e-05, "loss": 0.0627, "step": 8583 }, { "epoch": 0.6683081115274195, "grad_norm": 0.5819129092089604, "learning_rate": 1.027939209541791e-05, "loss": 0.0621, "step": 8584 }, { "epoch": 0.6683859666196292, "grad_norm": 0.5458781663545389, "learning_rate": 1.0275029850416077e-05, "loss": 0.049, "step": 8585 }, { "epoch": 0.6684638217118388, "grad_norm": 0.5810549860399875, "learning_rate": 1.027066821123417e-05, "loss": 0.0621, "step": 8586 }, { "epoch": 0.6685416768040484, "grad_norm": 0.48626865018233295, "learning_rate": 1.0266307178143901e-05, "loss": 0.0414, "step": 8587 }, { "epoch": 0.6686195318962581, "grad_norm": 0.56263195893795, "learning_rate": 1.0261946751416944e-05, "loss": 0.0527, "step": 8588 }, { "epoch": 0.6686973869884677, "grad_norm": 0.5947830885036264, "learning_rate": 1.0257586931324935e-05, "loss": 0.043, "step": 8589 }, { "epoch": 0.6687752420806773, "grad_norm": 0.565258903603117, "learning_rate": 1.0253227718139462e-05, "loss": 0.0583, "step": 8590 }, { "epoch": 0.668853097172887, "grad_norm": 0.4854551822854257, "learning_rate": 1.024886911213209e-05, "loss": 0.0349, "step": 8591 }, { "epoch": 0.6689309522650966, "grad_norm": 0.6119226214245093, "learning_rate": 1.024451111357433e-05, "loss": 0.0814, "step": 8592 }, { "epoch": 0.6690088073573062, "grad_norm": 0.6027091996967677, "learning_rate": 1.0240153722737678e-05, "loss": 0.0528, "step": 8593 }, { "epoch": 0.6690866624495159, "grad_norm": 0.5288189165084078, "learning_rate": 1.0235796939893578e-05, "loss": 0.0416, "step": 8594 }, { "epoch": 0.6691645175417255, "grad_norm": 0.6204461182301343, "learning_rate": 1.0231440765313429e-05, "loss": 0.0698, "step": 8595 }, { "epoch": 0.669242372633935, "grad_norm": 0.5837497395069989, "learning_rate": 1.0227085199268607e-05, "loss": 0.0611, "step": 8596 }, { "epoch": 0.6693202277261447, "grad_norm": 1.0899860658519558, "learning_rate": 1.0222730242030439e-05, "loss": 0.0663, "step": 8597 }, { "epoch": 0.6693980828183543, "grad_norm": 0.5709506141700642, "learning_rate": 1.021837589387022e-05, "loss": 0.0724, "step": 8598 }, { "epoch": 0.6694759379105639, "grad_norm": 0.5457861195003038, "learning_rate": 1.0214022155059204e-05, "loss": 0.0606, "step": 8599 }, { "epoch": 0.6695537930027736, "grad_norm": 0.6760371177996721, "learning_rate": 1.0209669025868613e-05, "loss": 0.0748, "step": 8600 }, { "epoch": 0.6695537930027736, "eval_loss": 0.007984872907400131, "eval_runtime": 162.5379, "eval_samples_per_second": 17.719, "eval_steps_per_second": 0.634, "step": 8600 }, { "epoch": 0.6696316480949832, "grad_norm": 0.6539779204041065, "learning_rate": 1.020531650656962e-05, "loss": 0.0664, "step": 8601 }, { "epoch": 0.6697095031871928, "grad_norm": 0.46244997164035045, "learning_rate": 1.0200964597433362e-05, "loss": 0.0429, "step": 8602 }, { "epoch": 0.6697873582794025, "grad_norm": 0.5643366721868404, "learning_rate": 1.019661329873096e-05, "loss": 0.0641, "step": 8603 }, { "epoch": 0.6698652133716121, "grad_norm": 0.7399157825570136, "learning_rate": 1.0192262610733467e-05, "loss": 0.0941, "step": 8604 }, { "epoch": 0.6699430684638217, "grad_norm": 0.5802635538807606, "learning_rate": 1.0187912533711915e-05, "loss": 0.0577, "step": 8605 }, { "epoch": 0.6700209235560314, "grad_norm": 0.6558999868058272, "learning_rate": 1.018356306793729e-05, "loss": 0.0768, "step": 8606 }, { "epoch": 0.670098778648241, "grad_norm": 0.6355823179209337, "learning_rate": 1.0179214213680543e-05, "loss": 0.0704, "step": 8607 }, { "epoch": 0.6701766337404506, "grad_norm": 0.43897390664441266, "learning_rate": 1.0174865971212591e-05, "loss": 0.0354, "step": 8608 }, { "epoch": 0.6702544888326603, "grad_norm": 0.505388653216899, "learning_rate": 1.0170518340804306e-05, "loss": 0.0502, "step": 8609 }, { "epoch": 0.6703323439248698, "grad_norm": 0.6725862941992983, "learning_rate": 1.0166171322726526e-05, "loss": 0.0872, "step": 8610 }, { "epoch": 0.6704101990170794, "grad_norm": 0.48416367305364044, "learning_rate": 1.016182491725005e-05, "loss": 0.0382, "step": 8611 }, { "epoch": 0.6704880541092891, "grad_norm": 0.5336811513839996, "learning_rate": 1.0157479124645628e-05, "loss": 0.0489, "step": 8612 }, { "epoch": 0.6705659092014987, "grad_norm": 0.6248532337770435, "learning_rate": 1.0153133945184004e-05, "loss": 0.0972, "step": 8613 }, { "epoch": 0.6706437642937083, "grad_norm": 0.5097923505800932, "learning_rate": 1.0148789379135849e-05, "loss": 0.0527, "step": 8614 }, { "epoch": 0.670721619385918, "grad_norm": 0.5569689829193392, "learning_rate": 1.0144445426771811e-05, "loss": 0.0713, "step": 8615 }, { "epoch": 0.6707994744781276, "grad_norm": 0.4788965866896478, "learning_rate": 1.0140102088362504e-05, "loss": 0.0451, "step": 8616 }, { "epoch": 0.6708773295703372, "grad_norm": 0.6523938220897132, "learning_rate": 1.0135759364178477e-05, "loss": 0.065, "step": 8617 }, { "epoch": 0.6709551846625469, "grad_norm": 0.580543602622209, "learning_rate": 1.0131417254490286e-05, "loss": 0.0575, "step": 8618 }, { "epoch": 0.6710330397547565, "grad_norm": 0.5663989728473483, "learning_rate": 1.0127075759568408e-05, "loss": 0.0786, "step": 8619 }, { "epoch": 0.6711108948469661, "grad_norm": 0.5568789740959832, "learning_rate": 1.0122734879683306e-05, "loss": 0.0669, "step": 8620 }, { "epoch": 0.6711887499391757, "grad_norm": 0.43166984269296893, "learning_rate": 1.0118394615105395e-05, "loss": 0.0365, "step": 8621 }, { "epoch": 0.6712666050313854, "grad_norm": 0.49695980918328797, "learning_rate": 1.0114054966105042e-05, "loss": 0.0634, "step": 8622 }, { "epoch": 0.6713444601235949, "grad_norm": 0.5564092816079533, "learning_rate": 1.0109715932952607e-05, "loss": 0.0644, "step": 8623 }, { "epoch": 0.6714223152158045, "grad_norm": 0.49441074432681575, "learning_rate": 1.010537751591838e-05, "loss": 0.039, "step": 8624 }, { "epoch": 0.6715001703080142, "grad_norm": 1.0790793269678591, "learning_rate": 1.010103971527263e-05, "loss": 0.0774, "step": 8625 }, { "epoch": 0.6715780254002238, "grad_norm": 0.6869463603423202, "learning_rate": 1.0096702531285571e-05, "loss": 0.081, "step": 8626 }, { "epoch": 0.6716558804924334, "grad_norm": 0.6259096631283718, "learning_rate": 1.0092365964227388e-05, "loss": 0.0892, "step": 8627 }, { "epoch": 0.6717337355846431, "grad_norm": 0.49701393279228534, "learning_rate": 1.0088030014368245e-05, "loss": 0.0468, "step": 8628 }, { "epoch": 0.6718115906768527, "grad_norm": 0.5592467369204811, "learning_rate": 1.0083694681978239e-05, "loss": 0.0649, "step": 8629 }, { "epoch": 0.6718894457690623, "grad_norm": 0.5319610340916182, "learning_rate": 1.0079359967327441e-05, "loss": 0.0621, "step": 8630 }, { "epoch": 0.671967300861272, "grad_norm": 0.5811475500044387, "learning_rate": 1.0075025870685888e-05, "loss": 0.0665, "step": 8631 }, { "epoch": 0.6720451559534816, "grad_norm": 0.5124151962706149, "learning_rate": 1.007069239232357e-05, "loss": 0.0516, "step": 8632 }, { "epoch": 0.6721230110456912, "grad_norm": 0.5078339936221908, "learning_rate": 1.006635953251044e-05, "loss": 0.0451, "step": 8633 }, { "epoch": 0.6722008661379009, "grad_norm": 0.5576767877836559, "learning_rate": 1.0062027291516431e-05, "loss": 0.0466, "step": 8634 }, { "epoch": 0.6722787212301105, "grad_norm": 0.5294818057653801, "learning_rate": 1.0057695669611407e-05, "loss": 0.0553, "step": 8635 }, { "epoch": 0.67235657632232, "grad_norm": 0.6045006580546626, "learning_rate": 1.0053364667065205e-05, "loss": 0.0673, "step": 8636 }, { "epoch": 0.6724344314145297, "grad_norm": 0.5831523586862282, "learning_rate": 1.004903428414763e-05, "loss": 0.063, "step": 8637 }, { "epoch": 0.6725122865067393, "grad_norm": 0.5798692923581008, "learning_rate": 1.0044704521128442e-05, "loss": 0.0693, "step": 8638 }, { "epoch": 0.6725901415989489, "grad_norm": 0.4641172012438579, "learning_rate": 1.0040375378277374e-05, "loss": 0.055, "step": 8639 }, { "epoch": 0.6726679966911586, "grad_norm": 0.5568718424637692, "learning_rate": 1.0036046855864104e-05, "loss": 0.0457, "step": 8640 }, { "epoch": 0.6727458517833682, "grad_norm": 0.5391368110075286, "learning_rate": 1.003171895415828e-05, "loss": 0.0567, "step": 8641 }, { "epoch": 0.6728237068755778, "grad_norm": 0.4915172426335575, "learning_rate": 1.0027391673429511e-05, "loss": 0.0399, "step": 8642 }, { "epoch": 0.6729015619677875, "grad_norm": 0.45142460368910253, "learning_rate": 1.0023065013947368e-05, "loss": 0.0384, "step": 8643 }, { "epoch": 0.6729794170599971, "grad_norm": 0.5083906924916973, "learning_rate": 1.0018738975981374e-05, "loss": 0.0485, "step": 8644 }, { "epoch": 0.6730572721522067, "grad_norm": 0.5768959761178437, "learning_rate": 1.0014413559801031e-05, "loss": 0.0475, "step": 8645 }, { "epoch": 0.6731351272444164, "grad_norm": 0.6390408898453215, "learning_rate": 1.0010088765675783e-05, "loss": 0.0752, "step": 8646 }, { "epoch": 0.673212982336626, "grad_norm": 0.551471720768124, "learning_rate": 1.0005764593875052e-05, "loss": 0.0632, "step": 8647 }, { "epoch": 0.6732908374288356, "grad_norm": 0.5341722158719215, "learning_rate": 1.0001441044668199e-05, "loss": 0.0655, "step": 8648 }, { "epoch": 0.6733686925210453, "grad_norm": 0.5203879858076023, "learning_rate": 9.997118118324577e-06, "loss": 0.0429, "step": 8649 }, { "epoch": 0.6734465476132548, "grad_norm": 0.48649758302691665, "learning_rate": 9.992795815113483e-06, "loss": 0.0353, "step": 8650 }, { "epoch": 0.6734465476132548, "eval_loss": 0.007903315126895905, "eval_runtime": 162.2146, "eval_samples_per_second": 17.754, "eval_steps_per_second": 0.635, "step": 8650 }, { "epoch": 0.6735244027054644, "grad_norm": 0.5685833287759209, "learning_rate": 9.988474135304169e-06, "loss": 0.0558, "step": 8651 }, { "epoch": 0.6736022577976741, "grad_norm": 0.6627419395623309, "learning_rate": 9.984153079165859e-06, "loss": 0.0691, "step": 8652 }, { "epoch": 0.6736801128898837, "grad_norm": 0.6043179302297976, "learning_rate": 9.979832646967732e-06, "loss": 0.0623, "step": 8653 }, { "epoch": 0.6737579679820933, "grad_norm": 0.5200489854651241, "learning_rate": 9.975512838978935e-06, "loss": 0.0537, "step": 8654 }, { "epoch": 0.673835823074303, "grad_norm": 0.6283757565648229, "learning_rate": 9.971193655468567e-06, "loss": 0.0653, "step": 8655 }, { "epoch": 0.6739136781665126, "grad_norm": 0.6107700424662902, "learning_rate": 9.966875096705695e-06, "loss": 0.0695, "step": 8656 }, { "epoch": 0.6739915332587222, "grad_norm": 0.47806549177377333, "learning_rate": 9.962557162959345e-06, "loss": 0.0434, "step": 8657 }, { "epoch": 0.6740693883509318, "grad_norm": 0.5136375754949766, "learning_rate": 9.958239854498496e-06, "loss": 0.0442, "step": 8658 }, { "epoch": 0.6741472434431415, "grad_norm": 0.9013837882916759, "learning_rate": 9.953923171592113e-06, "loss": 0.0524, "step": 8659 }, { "epoch": 0.6742250985353511, "grad_norm": 0.44990647574320447, "learning_rate": 9.949607114509093e-06, "loss": 0.0337, "step": 8660 }, { "epoch": 0.6743029536275607, "grad_norm": 0.43491630151404614, "learning_rate": 9.945291683518307e-06, "loss": 0.0436, "step": 8661 }, { "epoch": 0.6743808087197704, "grad_norm": 0.6307155088434331, "learning_rate": 9.940976878888592e-06, "loss": 0.0613, "step": 8662 }, { "epoch": 0.6744586638119799, "grad_norm": 0.6146092447741857, "learning_rate": 9.936662700888736e-06, "loss": 0.0706, "step": 8663 }, { "epoch": 0.6745365189041895, "grad_norm": 0.6016585275282395, "learning_rate": 9.932349149787487e-06, "loss": 0.0851, "step": 8664 }, { "epoch": 0.6746143739963992, "grad_norm": 0.5673754786835821, "learning_rate": 9.928036225853567e-06, "loss": 0.0622, "step": 8665 }, { "epoch": 0.6746922290886088, "grad_norm": 0.5648701479248575, "learning_rate": 9.923723929355648e-06, "loss": 0.0658, "step": 8666 }, { "epoch": 0.6747700841808184, "grad_norm": 0.5104332951818359, "learning_rate": 9.919412260562366e-06, "loss": 0.0502, "step": 8667 }, { "epoch": 0.6748479392730281, "grad_norm": 0.6002540737118502, "learning_rate": 9.915101219742316e-06, "loss": 0.0737, "step": 8668 }, { "epoch": 0.6749257943652377, "grad_norm": 0.5828295551569351, "learning_rate": 9.91079080716405e-06, "loss": 0.0506, "step": 8669 }, { "epoch": 0.6750036494574473, "grad_norm": 0.6245127235984473, "learning_rate": 9.906481023096099e-06, "loss": 0.0761, "step": 8670 }, { "epoch": 0.675081504549657, "grad_norm": 0.47563412699545793, "learning_rate": 9.90217186780694e-06, "loss": 0.0347, "step": 8671 }, { "epoch": 0.6751593596418666, "grad_norm": 0.6897357676362639, "learning_rate": 9.897863341565005e-06, "loss": 0.0878, "step": 8672 }, { "epoch": 0.6752372147340762, "grad_norm": 0.5315821847999196, "learning_rate": 9.893555444638702e-06, "loss": 0.0611, "step": 8673 }, { "epoch": 0.6753150698262859, "grad_norm": 0.5797421453134157, "learning_rate": 9.889248177296391e-06, "loss": 0.0658, "step": 8674 }, { "epoch": 0.6753929249184955, "grad_norm": 0.5782507973075497, "learning_rate": 9.884941539806394e-06, "loss": 0.063, "step": 8675 }, { "epoch": 0.675470780010705, "grad_norm": 0.5186336026982754, "learning_rate": 9.880635532436993e-06, "loss": 0.0543, "step": 8676 }, { "epoch": 0.6755486351029147, "grad_norm": 0.5352435533608786, "learning_rate": 9.876330155456436e-06, "loss": 0.0552, "step": 8677 }, { "epoch": 0.6756264901951243, "grad_norm": 0.582939921540995, "learning_rate": 9.872025409132922e-06, "loss": 0.0669, "step": 8678 }, { "epoch": 0.6757043452873339, "grad_norm": 0.5397526075912732, "learning_rate": 9.867721293734612e-06, "loss": 0.0455, "step": 8679 }, { "epoch": 0.6757822003795436, "grad_norm": 0.5007371651399506, "learning_rate": 9.863417809529645e-06, "loss": 0.0396, "step": 8680 }, { "epoch": 0.6758600554717532, "grad_norm": 0.6461912848082502, "learning_rate": 9.859114956786105e-06, "loss": 0.0625, "step": 8681 }, { "epoch": 0.6759379105639628, "grad_norm": 0.49137727730190034, "learning_rate": 9.854812735772038e-06, "loss": 0.0398, "step": 8682 }, { "epoch": 0.6760157656561725, "grad_norm": 0.5736233839020062, "learning_rate": 9.850511146755455e-06, "loss": 0.0476, "step": 8683 }, { "epoch": 0.6760936207483821, "grad_norm": 0.5754302511346506, "learning_rate": 9.846210190004308e-06, "loss": 0.054, "step": 8684 }, { "epoch": 0.6761714758405917, "grad_norm": 0.6110849276260604, "learning_rate": 9.841909865786546e-06, "loss": 0.0812, "step": 8685 }, { "epoch": 0.6762493309328014, "grad_norm": 0.4118577019323366, "learning_rate": 9.837610174370051e-06, "loss": 0.0316, "step": 8686 }, { "epoch": 0.676327186025011, "grad_norm": 0.5203848863280269, "learning_rate": 9.833311116022676e-06, "loss": 0.0545, "step": 8687 }, { "epoch": 0.6764050411172206, "grad_norm": 0.5826921734000658, "learning_rate": 9.829012691012233e-06, "loss": 0.0586, "step": 8688 }, { "epoch": 0.6764828962094303, "grad_norm": 0.5699265903464588, "learning_rate": 9.82471489960648e-06, "loss": 0.0783, "step": 8689 }, { "epoch": 0.6765607513016398, "grad_norm": 0.41806230052412763, "learning_rate": 9.82041774207317e-06, "loss": 0.0262, "step": 8690 }, { "epoch": 0.6766386063938494, "grad_norm": 0.6441651000836554, "learning_rate": 9.816121218679987e-06, "loss": 0.0666, "step": 8691 }, { "epoch": 0.676716461486059, "grad_norm": 0.46933615119420924, "learning_rate": 9.811825329694591e-06, "loss": 0.0377, "step": 8692 }, { "epoch": 0.6767943165782687, "grad_norm": 0.6412034209605169, "learning_rate": 9.80753007538458e-06, "loss": 0.0744, "step": 8693 }, { "epoch": 0.6768721716704783, "grad_norm": 0.5012896654895531, "learning_rate": 9.803235456017528e-06, "loss": 0.0537, "step": 8694 }, { "epoch": 0.6769500267626879, "grad_norm": 0.6182429793983972, "learning_rate": 9.798941471860988e-06, "loss": 0.0792, "step": 8695 }, { "epoch": 0.6770278818548976, "grad_norm": 0.5276982161744604, "learning_rate": 9.794648123182444e-06, "loss": 0.0484, "step": 8696 }, { "epoch": 0.6771057369471072, "grad_norm": 0.6541906893322846, "learning_rate": 9.790355410249353e-06, "loss": 0.0852, "step": 8697 }, { "epoch": 0.6771835920393168, "grad_norm": 0.5511741537156027, "learning_rate": 9.78606333332913e-06, "loss": 0.0512, "step": 8698 }, { "epoch": 0.6772614471315265, "grad_norm": 0.5047044130447703, "learning_rate": 9.781771892689142e-06, "loss": 0.0493, "step": 8699 }, { "epoch": 0.6773393022237361, "grad_norm": 0.5977238489250258, "learning_rate": 9.777481088596745e-06, "loss": 0.07, "step": 8700 }, { "epoch": 0.6773393022237361, "eval_loss": 0.007770792115479708, "eval_runtime": 163.0123, "eval_samples_per_second": 17.667, "eval_steps_per_second": 0.632, "step": 8700 }, { "epoch": 0.6774171573159457, "grad_norm": 0.5831179257010553, "learning_rate": 9.773190921319233e-06, "loss": 0.0616, "step": 8701 }, { "epoch": 0.6774950124081554, "grad_norm": 0.5691201506218744, "learning_rate": 9.76890139112385e-06, "loss": 0.0511, "step": 8702 }, { "epoch": 0.6775728675003649, "grad_norm": 0.4986673503162846, "learning_rate": 9.764612498277818e-06, "loss": 0.0571, "step": 8703 }, { "epoch": 0.6776507225925745, "grad_norm": 0.4729167676485063, "learning_rate": 9.760324243048311e-06, "loss": 0.0454, "step": 8704 }, { "epoch": 0.6777285776847842, "grad_norm": 0.579431207148761, "learning_rate": 9.756036625702479e-06, "loss": 0.0706, "step": 8705 }, { "epoch": 0.6778064327769938, "grad_norm": 0.4934322573897286, "learning_rate": 9.751749646507415e-06, "loss": 0.0427, "step": 8706 }, { "epoch": 0.6778842878692034, "grad_norm": 0.5226872238302774, "learning_rate": 9.747463305730178e-06, "loss": 0.059, "step": 8707 }, { "epoch": 0.6779621429614131, "grad_norm": 0.5297563328126038, "learning_rate": 9.743177603637783e-06, "loss": 0.0452, "step": 8708 }, { "epoch": 0.6780399980536227, "grad_norm": 0.5344235981124397, "learning_rate": 9.738892540497216e-06, "loss": 0.0561, "step": 8709 }, { "epoch": 0.6781178531458323, "grad_norm": 0.6508459694357739, "learning_rate": 9.734608116575408e-06, "loss": 0.0877, "step": 8710 }, { "epoch": 0.678195708238042, "grad_norm": 0.5703368375401059, "learning_rate": 9.730324332139265e-06, "loss": 0.0637, "step": 8711 }, { "epoch": 0.6782735633302516, "grad_norm": 0.6177700600081482, "learning_rate": 9.726041187455645e-06, "loss": 0.0828, "step": 8712 }, { "epoch": 0.6783514184224612, "grad_norm": 0.574443791877087, "learning_rate": 9.721758682791367e-06, "loss": 0.0646, "step": 8713 }, { "epoch": 0.6784292735146709, "grad_norm": 0.6103685597954277, "learning_rate": 9.717476818413209e-06, "loss": 0.0692, "step": 8714 }, { "epoch": 0.6785071286068804, "grad_norm": 0.6205196081400158, "learning_rate": 9.713195594587908e-06, "loss": 0.0716, "step": 8715 }, { "epoch": 0.67858498369909, "grad_norm": 0.5256414793022064, "learning_rate": 9.708915011582177e-06, "loss": 0.0651, "step": 8716 }, { "epoch": 0.6786628387912997, "grad_norm": 0.6802664806760007, "learning_rate": 9.704635069662664e-06, "loss": 0.0937, "step": 8717 }, { "epoch": 0.6787406938835093, "grad_norm": 0.5972457325525865, "learning_rate": 9.700355769095999e-06, "loss": 0.0779, "step": 8718 }, { "epoch": 0.6788185489757189, "grad_norm": 0.5102378302429817, "learning_rate": 9.696077110148755e-06, "loss": 0.0511, "step": 8719 }, { "epoch": 0.6788964040679286, "grad_norm": 0.5115105286270833, "learning_rate": 9.691799093087472e-06, "loss": 0.0515, "step": 8720 }, { "epoch": 0.6789742591601382, "grad_norm": 0.5300020572267947, "learning_rate": 9.687521718178656e-06, "loss": 0.0672, "step": 8721 }, { "epoch": 0.6790521142523478, "grad_norm": 0.451120702369483, "learning_rate": 9.683244985688762e-06, "loss": 0.0272, "step": 8722 }, { "epoch": 0.6791299693445575, "grad_norm": 0.4748198120718784, "learning_rate": 9.678968895884213e-06, "loss": 0.0509, "step": 8723 }, { "epoch": 0.6792078244367671, "grad_norm": 0.5515617956362598, "learning_rate": 9.67469344903139e-06, "loss": 0.0578, "step": 8724 }, { "epoch": 0.6792856795289767, "grad_norm": 0.49406256974401014, "learning_rate": 9.670418645396623e-06, "loss": 0.0592, "step": 8725 }, { "epoch": 0.6793635346211864, "grad_norm": 0.4943307255233418, "learning_rate": 9.66614448524623e-06, "loss": 0.0557, "step": 8726 }, { "epoch": 0.679441389713396, "grad_norm": 0.6279655634825675, "learning_rate": 9.661870968846459e-06, "loss": 0.0895, "step": 8727 }, { "epoch": 0.6795192448056055, "grad_norm": 0.47466404712061017, "learning_rate": 9.657598096463536e-06, "loss": 0.0436, "step": 8728 }, { "epoch": 0.6795970998978151, "grad_norm": 0.5220199886619681, "learning_rate": 9.653325868363637e-06, "loss": 0.0499, "step": 8729 }, { "epoch": 0.6796749549900248, "grad_norm": 0.5005602925633776, "learning_rate": 9.649054284812902e-06, "loss": 0.0503, "step": 8730 }, { "epoch": 0.6797528100822344, "grad_norm": 0.5057221786477398, "learning_rate": 9.644783346077433e-06, "loss": 0.0599, "step": 8731 }, { "epoch": 0.679830665174444, "grad_norm": 0.5789058353823537, "learning_rate": 9.640513052423287e-06, "loss": 0.0668, "step": 8732 }, { "epoch": 0.6799085202666537, "grad_norm": 0.5233923479406953, "learning_rate": 9.636243404116484e-06, "loss": 0.0477, "step": 8733 }, { "epoch": 0.6799863753588633, "grad_norm": 0.6628630902298284, "learning_rate": 9.631974401423e-06, "loss": 0.1152, "step": 8734 }, { "epoch": 0.6800642304510729, "grad_norm": 0.5292712689394901, "learning_rate": 9.627706044608774e-06, "loss": 0.0509, "step": 8735 }, { "epoch": 0.6801420855432826, "grad_norm": 0.5150906709477373, "learning_rate": 9.623438333939711e-06, "loss": 0.0479, "step": 8736 }, { "epoch": 0.6802199406354922, "grad_norm": 0.5321376297517495, "learning_rate": 9.619171269681668e-06, "loss": 0.0611, "step": 8737 }, { "epoch": 0.6802977957277018, "grad_norm": 0.6721615465226185, "learning_rate": 9.614904852100462e-06, "loss": 0.0922, "step": 8738 }, { "epoch": 0.6803756508199115, "grad_norm": 0.6008056960918065, "learning_rate": 9.610639081461868e-06, "loss": 0.0828, "step": 8739 }, { "epoch": 0.6804535059121211, "grad_norm": 0.47326707511621274, "learning_rate": 9.606373958031622e-06, "loss": 0.0472, "step": 8740 }, { "epoch": 0.6805313610043306, "grad_norm": 0.5220977325653704, "learning_rate": 9.602109482075428e-06, "loss": 0.0715, "step": 8741 }, { "epoch": 0.6806092160965403, "grad_norm": 0.4761160171276531, "learning_rate": 9.597845653858936e-06, "loss": 0.0513, "step": 8742 }, { "epoch": 0.6806870711887499, "grad_norm": 0.5243314155890143, "learning_rate": 9.59358247364777e-06, "loss": 0.0644, "step": 8743 }, { "epoch": 0.6807649262809595, "grad_norm": 0.5716733707600263, "learning_rate": 9.589319941707497e-06, "loss": 0.0582, "step": 8744 }, { "epoch": 0.6808427813731692, "grad_norm": 0.5180957033682925, "learning_rate": 9.585058058303657e-06, "loss": 0.0506, "step": 8745 }, { "epoch": 0.6809206364653788, "grad_norm": 0.5195514375720985, "learning_rate": 9.580796823701742e-06, "loss": 0.057, "step": 8746 }, { "epoch": 0.6809984915575884, "grad_norm": 0.5450540628336261, "learning_rate": 9.576536238167214e-06, "loss": 0.0532, "step": 8747 }, { "epoch": 0.6810763466497981, "grad_norm": 0.5111969363638958, "learning_rate": 9.572276301965485e-06, "loss": 0.046, "step": 8748 }, { "epoch": 0.6811542017420077, "grad_norm": 0.47159166106336164, "learning_rate": 9.568017015361927e-06, "loss": 0.034, "step": 8749 }, { "epoch": 0.6812320568342173, "grad_norm": 0.524511410802421, "learning_rate": 9.563758378621884e-06, "loss": 0.0574, "step": 8750 }, { "epoch": 0.6812320568342173, "eval_loss": 0.00768662802875042, "eval_runtime": 162.6473, "eval_samples_per_second": 17.707, "eval_steps_per_second": 0.633, "step": 8750 }, { "epoch": 0.681309911926427, "grad_norm": 0.5684916034226112, "learning_rate": 9.559500392010621e-06, "loss": 0.0618, "step": 8751 }, { "epoch": 0.6813877670186366, "grad_norm": 0.5915400518363103, "learning_rate": 9.555243055793417e-06, "loss": 0.0762, "step": 8752 }, { "epoch": 0.6814656221108462, "grad_norm": 0.4798958080574479, "learning_rate": 9.550986370235478e-06, "loss": 0.0471, "step": 8753 }, { "epoch": 0.6815434772030559, "grad_norm": 0.6290992266336688, "learning_rate": 9.546730335601972e-06, "loss": 0.0684, "step": 8754 }, { "epoch": 0.6816213322952654, "grad_norm": 0.6197446643345261, "learning_rate": 9.542474952158031e-06, "loss": 0.0721, "step": 8755 }, { "epoch": 0.681699187387475, "grad_norm": 0.4650315235121937, "learning_rate": 9.538220220168741e-06, "loss": 0.0345, "step": 8756 }, { "epoch": 0.6817770424796847, "grad_norm": 0.5297848370846564, "learning_rate": 9.533966139899163e-06, "loss": 0.055, "step": 8757 }, { "epoch": 0.6818548975718943, "grad_norm": 0.6274562711547647, "learning_rate": 9.5297127116143e-06, "loss": 0.0742, "step": 8758 }, { "epoch": 0.6819327526641039, "grad_norm": 0.5883317879347084, "learning_rate": 9.525459935579124e-06, "loss": 0.0689, "step": 8759 }, { "epoch": 0.6820106077563136, "grad_norm": 0.5757412241737339, "learning_rate": 9.521207812058556e-06, "loss": 0.0569, "step": 8760 }, { "epoch": 0.6820884628485232, "grad_norm": 0.42996332489315015, "learning_rate": 9.516956341317478e-06, "loss": 0.0404, "step": 8761 }, { "epoch": 0.6821663179407328, "grad_norm": 0.5717397677683455, "learning_rate": 9.512705523620752e-06, "loss": 0.061, "step": 8762 }, { "epoch": 0.6822441730329425, "grad_norm": 0.47880153026769434, "learning_rate": 9.508455359233177e-06, "loss": 0.0487, "step": 8763 }, { "epoch": 0.6823220281251521, "grad_norm": 0.6276029555298848, "learning_rate": 9.504205848419521e-06, "loss": 0.0666, "step": 8764 }, { "epoch": 0.6823998832173617, "grad_norm": 0.611909164903314, "learning_rate": 9.499956991444503e-06, "loss": 0.076, "step": 8765 }, { "epoch": 0.6824777383095713, "grad_norm": 0.5299648564282492, "learning_rate": 9.495708788572803e-06, "loss": 0.0449, "step": 8766 }, { "epoch": 0.682555593401781, "grad_norm": 0.41280104682151564, "learning_rate": 9.49146124006908e-06, "loss": 0.0363, "step": 8767 }, { "epoch": 0.6826334484939905, "grad_norm": 0.5601903561165287, "learning_rate": 9.487214346197935e-06, "loss": 0.0655, "step": 8768 }, { "epoch": 0.6827113035862001, "grad_norm": 0.5386562380577994, "learning_rate": 9.482968107223913e-06, "loss": 0.06, "step": 8769 }, { "epoch": 0.6827891586784098, "grad_norm": 0.5797760296712601, "learning_rate": 9.478722523411543e-06, "loss": 0.0607, "step": 8770 }, { "epoch": 0.6828670137706194, "grad_norm": 0.5126192571838204, "learning_rate": 9.4744775950253e-06, "loss": 0.0389, "step": 8771 }, { "epoch": 0.682944868862829, "grad_norm": 0.5825742701484212, "learning_rate": 9.470233322329635e-06, "loss": 0.0512, "step": 8772 }, { "epoch": 0.6830227239550387, "grad_norm": 0.6038778040377072, "learning_rate": 9.465989705588939e-06, "loss": 0.058, "step": 8773 }, { "epoch": 0.6831005790472483, "grad_norm": 0.5489059563491212, "learning_rate": 9.46174674506757e-06, "loss": 0.0609, "step": 8774 }, { "epoch": 0.6831784341394579, "grad_norm": 0.5334647145708608, "learning_rate": 9.457504441029845e-06, "loss": 0.0548, "step": 8775 }, { "epoch": 0.6832562892316676, "grad_norm": 0.5083253890459877, "learning_rate": 9.453262793740032e-06, "loss": 0.0495, "step": 8776 }, { "epoch": 0.6833341443238772, "grad_norm": 0.4606286400094224, "learning_rate": 9.449021803462389e-06, "loss": 0.0417, "step": 8777 }, { "epoch": 0.6834119994160868, "grad_norm": 0.5737722643975387, "learning_rate": 9.444781470461084e-06, "loss": 0.0508, "step": 8778 }, { "epoch": 0.6834898545082965, "grad_norm": 0.6050648929714952, "learning_rate": 9.44054179500028e-06, "loss": 0.0819, "step": 8779 }, { "epoch": 0.6835677096005061, "grad_norm": 0.5493102473200087, "learning_rate": 9.43630277734409e-06, "loss": 0.046, "step": 8780 }, { "epoch": 0.6836455646927156, "grad_norm": 0.5127497034699686, "learning_rate": 9.432064417756572e-06, "loss": 0.0608, "step": 8781 }, { "epoch": 0.6837234197849253, "grad_norm": 0.6374157112684709, "learning_rate": 9.427826716501778e-06, "loss": 0.0588, "step": 8782 }, { "epoch": 0.6838012748771349, "grad_norm": 0.5788128479433176, "learning_rate": 9.423589673843685e-06, "loss": 0.0504, "step": 8783 }, { "epoch": 0.6838791299693445, "grad_norm": 0.559391828665395, "learning_rate": 9.419353290046241e-06, "loss": 0.0676, "step": 8784 }, { "epoch": 0.6839569850615542, "grad_norm": 0.47371098068146444, "learning_rate": 9.415117565373354e-06, "loss": 0.0348, "step": 8785 }, { "epoch": 0.6840348401537638, "grad_norm": 0.576143674002872, "learning_rate": 9.410882500088891e-06, "loss": 0.073, "step": 8786 }, { "epoch": 0.6841126952459734, "grad_norm": 0.4234617359513897, "learning_rate": 9.406648094456674e-06, "loss": 0.0337, "step": 8787 }, { "epoch": 0.6841905503381831, "grad_norm": 0.5410920638538658, "learning_rate": 9.402414348740488e-06, "loss": 0.0521, "step": 8788 }, { "epoch": 0.6842684054303927, "grad_norm": 0.633640450064397, "learning_rate": 9.398181263204074e-06, "loss": 0.0541, "step": 8789 }, { "epoch": 0.6843462605226023, "grad_norm": 0.522501545374407, "learning_rate": 9.393948838111134e-06, "loss": 0.0552, "step": 8790 }, { "epoch": 0.684424115614812, "grad_norm": 0.5176534350817592, "learning_rate": 9.389717073725328e-06, "loss": 0.0529, "step": 8791 }, { "epoch": 0.6845019707070216, "grad_norm": 0.5429851707649838, "learning_rate": 9.38548597031027e-06, "loss": 0.0481, "step": 8792 }, { "epoch": 0.6845798257992312, "grad_norm": 0.5817113698910953, "learning_rate": 9.381255528129549e-06, "loss": 0.0676, "step": 8793 }, { "epoch": 0.6846576808914409, "grad_norm": 0.5635218024657802, "learning_rate": 9.377025747446698e-06, "loss": 0.0693, "step": 8794 }, { "epoch": 0.6847355359836504, "grad_norm": 0.5860541106463231, "learning_rate": 9.372796628525212e-06, "loss": 0.0641, "step": 8795 }, { "epoch": 0.68481339107586, "grad_norm": 0.47702886478688855, "learning_rate": 9.368568171628541e-06, "loss": 0.0479, "step": 8796 }, { "epoch": 0.6848912461680697, "grad_norm": 0.5463380641138018, "learning_rate": 9.3643403770201e-06, "loss": 0.0523, "step": 8797 }, { "epoch": 0.6849691012602793, "grad_norm": 0.5761212940802517, "learning_rate": 9.360113244963263e-06, "loss": 0.0636, "step": 8798 }, { "epoch": 0.6850469563524889, "grad_norm": 0.5671022918445442, "learning_rate": 9.355886775721358e-06, "loss": 0.0627, "step": 8799 }, { "epoch": 0.6851248114446985, "grad_norm": 0.505642575558869, "learning_rate": 9.351660969557677e-06, "loss": 0.0681, "step": 8800 }, { "epoch": 0.6851248114446985, "eval_loss": 0.007619519252330065, "eval_runtime": 162.2499, "eval_samples_per_second": 17.75, "eval_steps_per_second": 0.635, "step": 8800 }, { "epoch": 0.6852026665369082, "grad_norm": 0.5995032732087232, "learning_rate": 9.347435826735465e-06, "loss": 0.0858, "step": 8801 }, { "epoch": 0.6852805216291178, "grad_norm": 0.4806530060949017, "learning_rate": 9.343211347517922e-06, "loss": 0.0362, "step": 8802 }, { "epoch": 0.6853583767213274, "grad_norm": 0.6185406296389646, "learning_rate": 9.338987532168228e-06, "loss": 0.0831, "step": 8803 }, { "epoch": 0.6854362318135371, "grad_norm": 0.5356022528069438, "learning_rate": 9.3347643809495e-06, "loss": 0.0653, "step": 8804 }, { "epoch": 0.6855140869057467, "grad_norm": 0.46506077951090413, "learning_rate": 9.33054189412482e-06, "loss": 0.0338, "step": 8805 }, { "epoch": 0.6855919419979563, "grad_norm": 0.4528277444623677, "learning_rate": 9.326320071957228e-06, "loss": 0.0429, "step": 8806 }, { "epoch": 0.685669797090166, "grad_norm": 0.5713059038930328, "learning_rate": 9.322098914709726e-06, "loss": 0.0572, "step": 8807 }, { "epoch": 0.6857476521823755, "grad_norm": 0.5135887087832391, "learning_rate": 9.31787842264527e-06, "loss": 0.0536, "step": 8808 }, { "epoch": 0.6858255072745851, "grad_norm": 0.5809941030635715, "learning_rate": 9.31365859602678e-06, "loss": 0.0652, "step": 8809 }, { "epoch": 0.6859033623667948, "grad_norm": 0.5327759787707356, "learning_rate": 9.309439435117126e-06, "loss": 0.0585, "step": 8810 }, { "epoch": 0.6859812174590044, "grad_norm": 0.5308002728943664, "learning_rate": 9.305220940179148e-06, "loss": 0.0548, "step": 8811 }, { "epoch": 0.686059072551214, "grad_norm": 0.6068763381308888, "learning_rate": 9.301003111475625e-06, "loss": 0.073, "step": 8812 }, { "epoch": 0.6861369276434237, "grad_norm": 0.4522189216411862, "learning_rate": 9.296785949269329e-06, "loss": 0.0412, "step": 8813 }, { "epoch": 0.6862147827356333, "grad_norm": 0.5344294528454511, "learning_rate": 9.29256945382296e-06, "loss": 0.0639, "step": 8814 }, { "epoch": 0.6862926378278429, "grad_norm": 0.5169458330766713, "learning_rate": 9.288353625399181e-06, "loss": 0.0591, "step": 8815 }, { "epoch": 0.6863704929200526, "grad_norm": 0.6773522033028253, "learning_rate": 9.28413846426063e-06, "loss": 0.0638, "step": 8816 }, { "epoch": 0.6864483480122622, "grad_norm": 0.5083580836337457, "learning_rate": 9.279923970669871e-06, "loss": 0.0533, "step": 8817 }, { "epoch": 0.6865262031044718, "grad_norm": 0.580625691908068, "learning_rate": 9.275710144889465e-06, "loss": 0.0678, "step": 8818 }, { "epoch": 0.6866040581966815, "grad_norm": 0.5223788582402834, "learning_rate": 9.271496987181909e-06, "loss": 0.0562, "step": 8819 }, { "epoch": 0.686681913288891, "grad_norm": 0.5035896389201253, "learning_rate": 9.267284497809664e-06, "loss": 0.0542, "step": 8820 }, { "epoch": 0.6867597683811006, "grad_norm": 0.5459177635151222, "learning_rate": 9.263072677035144e-06, "loss": 0.0507, "step": 8821 }, { "epoch": 0.6868376234733103, "grad_norm": 0.6055030590762416, "learning_rate": 9.258861525120729e-06, "loss": 0.0794, "step": 8822 }, { "epoch": 0.6869154785655199, "grad_norm": 0.5216440723552842, "learning_rate": 9.254651042328746e-06, "loss": 0.0505, "step": 8823 }, { "epoch": 0.6869933336577295, "grad_norm": 0.6267317912937942, "learning_rate": 9.250441228921504e-06, "loss": 0.0868, "step": 8824 }, { "epoch": 0.6870711887499392, "grad_norm": 0.5483043677874202, "learning_rate": 9.24623208516125e-06, "loss": 0.0512, "step": 8825 }, { "epoch": 0.6871490438421488, "grad_norm": 0.5376496889958688, "learning_rate": 9.242023611310187e-06, "loss": 0.0459, "step": 8826 }, { "epoch": 0.6872268989343584, "grad_norm": 0.4580710067643532, "learning_rate": 9.237815807630483e-06, "loss": 0.0277, "step": 8827 }, { "epoch": 0.6873047540265681, "grad_norm": 0.4991949402124655, "learning_rate": 9.23360867438426e-06, "loss": 0.0543, "step": 8828 }, { "epoch": 0.6873826091187777, "grad_norm": 0.44997082427347773, "learning_rate": 9.229402211833618e-06, "loss": 0.0361, "step": 8829 }, { "epoch": 0.6874604642109873, "grad_norm": 0.5558280299843098, "learning_rate": 9.225196420240594e-06, "loss": 0.0792, "step": 8830 }, { "epoch": 0.687538319303197, "grad_norm": 0.5179509246844888, "learning_rate": 9.220991299867186e-06, "loss": 0.0591, "step": 8831 }, { "epoch": 0.6876161743954066, "grad_norm": 0.4924330955722952, "learning_rate": 9.216786850975352e-06, "loss": 0.0355, "step": 8832 }, { "epoch": 0.6876940294876162, "grad_norm": 0.5691998368365452, "learning_rate": 9.212583073827006e-06, "loss": 0.0751, "step": 8833 }, { "epoch": 0.6877718845798259, "grad_norm": 0.5946790214938782, "learning_rate": 9.208379968684042e-06, "loss": 0.0568, "step": 8834 }, { "epoch": 0.6878497396720354, "grad_norm": 0.5550700147813257, "learning_rate": 9.204177535808276e-06, "loss": 0.0667, "step": 8835 }, { "epoch": 0.687927594764245, "grad_norm": 0.5323190521581783, "learning_rate": 9.199975775461504e-06, "loss": 0.0532, "step": 8836 }, { "epoch": 0.6880054498564546, "grad_norm": 0.46945663910665514, "learning_rate": 9.195774687905477e-06, "loss": 0.043, "step": 8837 }, { "epoch": 0.6880833049486643, "grad_norm": 0.5335156423232071, "learning_rate": 9.191574273401896e-06, "loss": 0.0581, "step": 8838 }, { "epoch": 0.6881611600408739, "grad_norm": 0.5897529646698956, "learning_rate": 9.187374532212439e-06, "loss": 0.0596, "step": 8839 }, { "epoch": 0.6882390151330835, "grad_norm": 0.5865683749608736, "learning_rate": 9.183175464598724e-06, "loss": 0.0764, "step": 8840 }, { "epoch": 0.6883168702252932, "grad_norm": 0.644456271772112, "learning_rate": 9.178977070822337e-06, "loss": 0.0861, "step": 8841 }, { "epoch": 0.6883947253175028, "grad_norm": 0.4614632369861389, "learning_rate": 9.174779351144813e-06, "loss": 0.0559, "step": 8842 }, { "epoch": 0.6884725804097124, "grad_norm": 0.5849656282258687, "learning_rate": 9.170582305827653e-06, "loss": 0.0737, "step": 8843 }, { "epoch": 0.6885504355019221, "grad_norm": 0.5420088926012592, "learning_rate": 9.166385935132312e-06, "loss": 0.0569, "step": 8844 }, { "epoch": 0.6886282905941317, "grad_norm": 0.5569642935962422, "learning_rate": 9.162190239320206e-06, "loss": 0.0681, "step": 8845 }, { "epoch": 0.6887061456863413, "grad_norm": 0.49810485974565577, "learning_rate": 9.157995218652705e-06, "loss": 0.0523, "step": 8846 }, { "epoch": 0.688784000778551, "grad_norm": 0.486597481919463, "learning_rate": 9.15380087339114e-06, "loss": 0.051, "step": 8847 }, { "epoch": 0.6888618558707605, "grad_norm": 0.5434256100479776, "learning_rate": 9.149607203796792e-06, "loss": 0.0666, "step": 8848 }, { "epoch": 0.6889397109629701, "grad_norm": 0.5589353845170676, "learning_rate": 9.145414210130921e-06, "loss": 0.0628, "step": 8849 }, { "epoch": 0.6890175660551798, "grad_norm": 0.5158187124499303, "learning_rate": 9.141221892654723e-06, "loss": 0.0632, "step": 8850 }, { "epoch": 0.6890175660551798, "eval_loss": 0.0075673130340874195, "eval_runtime": 162.3599, "eval_samples_per_second": 17.738, "eval_steps_per_second": 0.634, "step": 8850 }, { "epoch": 0.6890954211473894, "grad_norm": 0.5997678969655714, "learning_rate": 9.137030251629361e-06, "loss": 0.068, "step": 8851 }, { "epoch": 0.689173276239599, "grad_norm": 0.5007802893677105, "learning_rate": 9.132839287315956e-06, "loss": 0.0576, "step": 8852 }, { "epoch": 0.6892511313318087, "grad_norm": 0.4925603379379714, "learning_rate": 9.12864899997558e-06, "loss": 0.0535, "step": 8853 }, { "epoch": 0.6893289864240183, "grad_norm": 0.5760975593814569, "learning_rate": 9.12445938986927e-06, "loss": 0.0678, "step": 8854 }, { "epoch": 0.6894068415162279, "grad_norm": 0.41311432859908215, "learning_rate": 9.120270457258025e-06, "loss": 0.0352, "step": 8855 }, { "epoch": 0.6894846966084376, "grad_norm": 0.4889630184647151, "learning_rate": 9.11608220240279e-06, "loss": 0.0497, "step": 8856 }, { "epoch": 0.6895625517006472, "grad_norm": 0.6598054064520891, "learning_rate": 9.111894625564474e-06, "loss": 0.0728, "step": 8857 }, { "epoch": 0.6896404067928568, "grad_norm": 0.5852073349871209, "learning_rate": 9.107707727003937e-06, "loss": 0.0757, "step": 8858 }, { "epoch": 0.6897182618850665, "grad_norm": 0.5119442456917901, "learning_rate": 9.103521506982019e-06, "loss": 0.0555, "step": 8859 }, { "epoch": 0.689796116977276, "grad_norm": 0.5169503102649583, "learning_rate": 9.09933596575949e-06, "loss": 0.0557, "step": 8860 }, { "epoch": 0.6898739720694856, "grad_norm": 0.4528533835038846, "learning_rate": 9.095151103597095e-06, "loss": 0.0424, "step": 8861 }, { "epoch": 0.6899518271616953, "grad_norm": 0.5353355050127736, "learning_rate": 9.090966920755529e-06, "loss": 0.0536, "step": 8862 }, { "epoch": 0.6900296822539049, "grad_norm": 0.5262848722325438, "learning_rate": 9.086783417495444e-06, "loss": 0.0566, "step": 8863 }, { "epoch": 0.6901075373461145, "grad_norm": 0.5141576614131683, "learning_rate": 9.082600594077459e-06, "loss": 0.0466, "step": 8864 }, { "epoch": 0.6901853924383242, "grad_norm": 0.5334496204995454, "learning_rate": 9.078418450762139e-06, "loss": 0.0537, "step": 8865 }, { "epoch": 0.6902632475305338, "grad_norm": 0.5442992615557796, "learning_rate": 9.074236987810015e-06, "loss": 0.0565, "step": 8866 }, { "epoch": 0.6903411026227434, "grad_norm": 0.54457343829368, "learning_rate": 9.07005620548157e-06, "loss": 0.0557, "step": 8867 }, { "epoch": 0.6904189577149531, "grad_norm": 0.47949059303356933, "learning_rate": 9.065876104037247e-06, "loss": 0.0431, "step": 8868 }, { "epoch": 0.6904968128071627, "grad_norm": 0.6187918061616199, "learning_rate": 9.061696683737442e-06, "loss": 0.0594, "step": 8869 }, { "epoch": 0.6905746678993723, "grad_norm": 0.527355056326331, "learning_rate": 9.057517944842524e-06, "loss": 0.0537, "step": 8870 }, { "epoch": 0.6906525229915819, "grad_norm": 0.47753111981471874, "learning_rate": 9.053339887612807e-06, "loss": 0.0316, "step": 8871 }, { "epoch": 0.6907303780837916, "grad_norm": 0.46061062824339216, "learning_rate": 9.04916251230856e-06, "loss": 0.048, "step": 8872 }, { "epoch": 0.6908082331760012, "grad_norm": 0.637367041113896, "learning_rate": 9.044985819190015e-06, "loss": 0.0735, "step": 8873 }, { "epoch": 0.6908860882682107, "grad_norm": 0.5227084938622384, "learning_rate": 9.040809808517361e-06, "loss": 0.0558, "step": 8874 }, { "epoch": 0.6909639433604204, "grad_norm": 0.5537936095357, "learning_rate": 9.036634480550741e-06, "loss": 0.0668, "step": 8875 }, { "epoch": 0.69104179845263, "grad_norm": 0.5483269614444715, "learning_rate": 9.032459835550262e-06, "loss": 0.0535, "step": 8876 }, { "epoch": 0.6911196535448396, "grad_norm": 0.5130778291169359, "learning_rate": 9.028285873775983e-06, "loss": 0.0609, "step": 8877 }, { "epoch": 0.6911975086370493, "grad_norm": 0.56739046154281, "learning_rate": 9.024112595487925e-06, "loss": 0.0726, "step": 8878 }, { "epoch": 0.6912753637292589, "grad_norm": 0.4577969933027556, "learning_rate": 9.01994000094605e-06, "loss": 0.0414, "step": 8879 }, { "epoch": 0.6913532188214685, "grad_norm": 0.5197076723724746, "learning_rate": 9.015768090410311e-06, "loss": 0.0593, "step": 8880 }, { "epoch": 0.6914310739136782, "grad_norm": 0.4750179242020599, "learning_rate": 9.011596864140588e-06, "loss": 0.0352, "step": 8881 }, { "epoch": 0.6915089290058878, "grad_norm": 0.6331495662314959, "learning_rate": 9.007426322396732e-06, "loss": 0.0604, "step": 8882 }, { "epoch": 0.6915867840980974, "grad_norm": 0.5066349390005305, "learning_rate": 9.003256465438555e-06, "loss": 0.0419, "step": 8883 }, { "epoch": 0.6916646391903071, "grad_norm": 0.5662831447369264, "learning_rate": 8.999087293525794e-06, "loss": 0.0545, "step": 8884 }, { "epoch": 0.6917424942825167, "grad_norm": 0.4733622226749668, "learning_rate": 8.994918806918194e-06, "loss": 0.0466, "step": 8885 }, { "epoch": 0.6918203493747263, "grad_norm": 0.5522366113377025, "learning_rate": 8.990751005875422e-06, "loss": 0.0492, "step": 8886 }, { "epoch": 0.691898204466936, "grad_norm": 0.5371984374379579, "learning_rate": 8.986583890657117e-06, "loss": 0.0606, "step": 8887 }, { "epoch": 0.6919760595591455, "grad_norm": 0.4891472731223879, "learning_rate": 8.982417461522868e-06, "loss": 0.0464, "step": 8888 }, { "epoch": 0.6920539146513551, "grad_norm": 0.5650005467571251, "learning_rate": 8.978251718732214e-06, "loss": 0.0627, "step": 8889 }, { "epoch": 0.6921317697435648, "grad_norm": 0.4394086845018157, "learning_rate": 8.974086662544681e-06, "loss": 0.043, "step": 8890 }, { "epoch": 0.6922096248357744, "grad_norm": 0.6468334038067969, "learning_rate": 8.969922293219722e-06, "loss": 0.0842, "step": 8891 }, { "epoch": 0.692287479927984, "grad_norm": 0.43772043782319764, "learning_rate": 8.965758611016764e-06, "loss": 0.0371, "step": 8892 }, { "epoch": 0.6923653350201937, "grad_norm": 0.557213282656511, "learning_rate": 8.961595616195173e-06, "loss": 0.0588, "step": 8893 }, { "epoch": 0.6924431901124033, "grad_norm": 0.5594387532570015, "learning_rate": 8.957433309014282e-06, "loss": 0.0552, "step": 8894 }, { "epoch": 0.6925210452046129, "grad_norm": 0.5120725914392553, "learning_rate": 8.953271689733398e-06, "loss": 0.0487, "step": 8895 }, { "epoch": 0.6925989002968226, "grad_norm": 0.4531216328452374, "learning_rate": 8.949110758611765e-06, "loss": 0.0431, "step": 8896 }, { "epoch": 0.6926767553890322, "grad_norm": 0.4654269890563192, "learning_rate": 8.944950515908588e-06, "loss": 0.033, "step": 8897 }, { "epoch": 0.6927546104812418, "grad_norm": 0.45496549316765283, "learning_rate": 8.94079096188303e-06, "loss": 0.0374, "step": 8898 }, { "epoch": 0.6928324655734515, "grad_norm": 0.5812793412317555, "learning_rate": 8.936632096794213e-06, "loss": 0.0657, "step": 8899 }, { "epoch": 0.692910320665661, "grad_norm": 0.5255911150527474, "learning_rate": 8.932473920901206e-06, "loss": 0.0472, "step": 8900 }, { "epoch": 0.692910320665661, "eval_loss": 0.007468670140951872, "eval_runtime": 162.2653, "eval_samples_per_second": 17.749, "eval_steps_per_second": 0.635, "step": 8900 }, { "epoch": 0.6929881757578706, "grad_norm": 0.5351398707826379, "learning_rate": 8.928316434463064e-06, "loss": 0.0597, "step": 8901 }, { "epoch": 0.6930660308500803, "grad_norm": 0.5669140870884163, "learning_rate": 8.924159637738763e-06, "loss": 0.0605, "step": 8902 }, { "epoch": 0.6931438859422899, "grad_norm": 0.4942046689777641, "learning_rate": 8.920003530987254e-06, "loss": 0.0393, "step": 8903 }, { "epoch": 0.6932217410344995, "grad_norm": 0.45674685672317317, "learning_rate": 8.915848114467445e-06, "loss": 0.0395, "step": 8904 }, { "epoch": 0.6932995961267092, "grad_norm": 0.7922608359985202, "learning_rate": 8.91169338843819e-06, "loss": 0.1042, "step": 8905 }, { "epoch": 0.6933774512189188, "grad_norm": 0.462383445931993, "learning_rate": 8.907539353158325e-06, "loss": 0.0322, "step": 8906 }, { "epoch": 0.6934553063111284, "grad_norm": 0.4879758985178211, "learning_rate": 8.90338600888662e-06, "loss": 0.0408, "step": 8907 }, { "epoch": 0.693533161403338, "grad_norm": 0.5862087956972934, "learning_rate": 8.899233355881807e-06, "loss": 0.0622, "step": 8908 }, { "epoch": 0.6936110164955477, "grad_norm": 0.49374726052720735, "learning_rate": 8.895081394402578e-06, "loss": 0.054, "step": 8909 }, { "epoch": 0.6936888715877573, "grad_norm": 0.5541829727192058, "learning_rate": 8.89093012470758e-06, "loss": 0.0571, "step": 8910 }, { "epoch": 0.6937667266799669, "grad_norm": 0.5236311943317553, "learning_rate": 8.886779547055418e-06, "loss": 0.0448, "step": 8911 }, { "epoch": 0.6938445817721766, "grad_norm": 0.5696462371152059, "learning_rate": 8.882629661704654e-06, "loss": 0.0625, "step": 8912 }, { "epoch": 0.6939224368643861, "grad_norm": 0.4908704127108599, "learning_rate": 8.878480468913806e-06, "loss": 0.0383, "step": 8913 }, { "epoch": 0.6940002919565957, "grad_norm": 0.5820277429445058, "learning_rate": 8.874331968941348e-06, "loss": 0.0715, "step": 8914 }, { "epoch": 0.6940781470488054, "grad_norm": 0.5572001652987464, "learning_rate": 8.870184162045707e-06, "loss": 0.0733, "step": 8915 }, { "epoch": 0.694156002141015, "grad_norm": 0.5715332688188995, "learning_rate": 8.866037048485285e-06, "loss": 0.0745, "step": 8916 }, { "epoch": 0.6942338572332246, "grad_norm": 0.5744230135591303, "learning_rate": 8.86189062851842e-06, "loss": 0.0689, "step": 8917 }, { "epoch": 0.6943117123254343, "grad_norm": 0.5692594206915926, "learning_rate": 8.857744902403415e-06, "loss": 0.0621, "step": 8918 }, { "epoch": 0.6943895674176439, "grad_norm": 0.4501415834906116, "learning_rate": 8.85359987039853e-06, "loss": 0.0412, "step": 8919 }, { "epoch": 0.6944674225098535, "grad_norm": 0.5984196959209318, "learning_rate": 8.84945553276198e-06, "loss": 0.0774, "step": 8920 }, { "epoch": 0.6945452776020632, "grad_norm": 0.6110691977481509, "learning_rate": 8.845311889751937e-06, "loss": 0.0757, "step": 8921 }, { "epoch": 0.6946231326942728, "grad_norm": 0.46800928493069005, "learning_rate": 8.841168941626534e-06, "loss": 0.0381, "step": 8922 }, { "epoch": 0.6947009877864824, "grad_norm": 0.49018438503634876, "learning_rate": 8.837026688643852e-06, "loss": 0.0388, "step": 8923 }, { "epoch": 0.6947788428786921, "grad_norm": 0.43050838814870124, "learning_rate": 8.832885131061939e-06, "loss": 0.0375, "step": 8924 }, { "epoch": 0.6948566979709017, "grad_norm": 0.4669939396950246, "learning_rate": 8.828744269138784e-06, "loss": 0.0488, "step": 8925 }, { "epoch": 0.6949345530631112, "grad_norm": 0.583101428676794, "learning_rate": 8.82460410313236e-06, "loss": 0.0608, "step": 8926 }, { "epoch": 0.695012408155321, "grad_norm": 0.5856327413704425, "learning_rate": 8.820464633300571e-06, "loss": 0.0734, "step": 8927 }, { "epoch": 0.6950902632475305, "grad_norm": 0.48193588781038904, "learning_rate": 8.816325859901288e-06, "loss": 0.0564, "step": 8928 }, { "epoch": 0.6951681183397401, "grad_norm": 0.5294369402166089, "learning_rate": 8.812187783192333e-06, "loss": 0.0506, "step": 8929 }, { "epoch": 0.6952459734319498, "grad_norm": 0.5198271506131338, "learning_rate": 8.808050403431492e-06, "loss": 0.0548, "step": 8930 }, { "epoch": 0.6953238285241594, "grad_norm": 0.47093020899011456, "learning_rate": 8.803913720876507e-06, "loss": 0.053, "step": 8931 }, { "epoch": 0.695401683616369, "grad_norm": 0.5154999623967841, "learning_rate": 8.799777735785068e-06, "loss": 0.0454, "step": 8932 }, { "epoch": 0.6954795387085787, "grad_norm": 0.5017389936041141, "learning_rate": 8.795642448414832e-06, "loss": 0.0487, "step": 8933 }, { "epoch": 0.6955573938007883, "grad_norm": 0.4804442767962452, "learning_rate": 8.791507859023405e-06, "loss": 0.0485, "step": 8934 }, { "epoch": 0.6956352488929979, "grad_norm": 0.49541085112564803, "learning_rate": 8.787373967868346e-06, "loss": 0.0502, "step": 8935 }, { "epoch": 0.6957131039852076, "grad_norm": 0.5162822321413779, "learning_rate": 8.783240775207196e-06, "loss": 0.0536, "step": 8936 }, { "epoch": 0.6957909590774172, "grad_norm": 0.5652553043809462, "learning_rate": 8.77910828129742e-06, "loss": 0.0617, "step": 8937 }, { "epoch": 0.6958688141696268, "grad_norm": 0.5363497630119671, "learning_rate": 8.774976486396453e-06, "loss": 0.0539, "step": 8938 }, { "epoch": 0.6959466692618365, "grad_norm": 0.4519718025744054, "learning_rate": 8.770845390761691e-06, "loss": 0.0529, "step": 8939 }, { "epoch": 0.696024524354046, "grad_norm": 0.49718358363579246, "learning_rate": 8.766714994650478e-06, "loss": 0.0553, "step": 8940 }, { "epoch": 0.6961023794462556, "grad_norm": 0.6127238522298903, "learning_rate": 8.76258529832012e-06, "loss": 0.0879, "step": 8941 }, { "epoch": 0.6961802345384653, "grad_norm": 0.5751582326918256, "learning_rate": 8.75845630202788e-06, "loss": 0.0765, "step": 8942 }, { "epoch": 0.6962580896306749, "grad_norm": 0.4907210484168231, "learning_rate": 8.75432800603097e-06, "loss": 0.0468, "step": 8943 }, { "epoch": 0.6963359447228845, "grad_norm": 0.4230610697343231, "learning_rate": 8.750200410586566e-06, "loss": 0.0373, "step": 8944 }, { "epoch": 0.6964137998150941, "grad_norm": 0.5636725821241717, "learning_rate": 8.746073515951799e-06, "loss": 0.0697, "step": 8945 }, { "epoch": 0.6964916549073038, "grad_norm": 0.5667594157716035, "learning_rate": 8.741947322383745e-06, "loss": 0.0667, "step": 8946 }, { "epoch": 0.6965695099995134, "grad_norm": 0.5350537072694446, "learning_rate": 8.737821830139463e-06, "loss": 0.0682, "step": 8947 }, { "epoch": 0.696647365091723, "grad_norm": 0.4503181621502275, "learning_rate": 8.733697039475946e-06, "loss": 0.044, "step": 8948 }, { "epoch": 0.6967252201839327, "grad_norm": 0.48180006350364807, "learning_rate": 8.729572950650144e-06, "loss": 0.0526, "step": 8949 }, { "epoch": 0.6968030752761423, "grad_norm": 0.5997757539096494, "learning_rate": 8.725449563918981e-06, "loss": 0.0807, "step": 8950 }, { "epoch": 0.6968030752761423, "eval_loss": 0.007351218722760677, "eval_runtime": 162.2362, "eval_samples_per_second": 17.752, "eval_steps_per_second": 0.635, "step": 8950 }, { "epoch": 0.6968809303683519, "grad_norm": 0.5984621204927952, "learning_rate": 8.7213268795393e-06, "loss": 0.0593, "step": 8951 }, { "epoch": 0.6969587854605616, "grad_norm": 0.5737889580387467, "learning_rate": 8.717204897767948e-06, "loss": 0.0776, "step": 8952 }, { "epoch": 0.6970366405527711, "grad_norm": 0.5190981262697041, "learning_rate": 8.713083618861695e-06, "loss": 0.0418, "step": 8953 }, { "epoch": 0.6971144956449807, "grad_norm": 0.5166941912032483, "learning_rate": 8.708963043077281e-06, "loss": 0.0424, "step": 8954 }, { "epoch": 0.6971923507371904, "grad_norm": 0.5495643429186525, "learning_rate": 8.704843170671395e-06, "loss": 0.0585, "step": 8955 }, { "epoch": 0.6972702058294, "grad_norm": 0.6488259971946226, "learning_rate": 8.700724001900683e-06, "loss": 0.0654, "step": 8956 }, { "epoch": 0.6973480609216096, "grad_norm": 0.5806131801573062, "learning_rate": 8.696605537021761e-06, "loss": 0.074, "step": 8957 }, { "epoch": 0.6974259160138193, "grad_norm": 0.5806134617296165, "learning_rate": 8.692487776291181e-06, "loss": 0.0617, "step": 8958 }, { "epoch": 0.6975037711060289, "grad_norm": 0.44024686280396097, "learning_rate": 8.688370719965473e-06, "loss": 0.0446, "step": 8959 }, { "epoch": 0.6975816261982385, "grad_norm": 0.5847707120372286, "learning_rate": 8.684254368301091e-06, "loss": 0.0664, "step": 8960 }, { "epoch": 0.6976594812904482, "grad_norm": 0.43883068710303863, "learning_rate": 8.680138721554465e-06, "loss": 0.0416, "step": 8961 }, { "epoch": 0.6977373363826578, "grad_norm": 0.5120774333255728, "learning_rate": 8.676023779981996e-06, "loss": 0.0519, "step": 8962 }, { "epoch": 0.6978151914748674, "grad_norm": 0.47008833355263613, "learning_rate": 8.67190954384002e-06, "loss": 0.0563, "step": 8963 }, { "epoch": 0.6978930465670771, "grad_norm": 0.5663541250765791, "learning_rate": 8.667796013384831e-06, "loss": 0.0753, "step": 8964 }, { "epoch": 0.6979709016592867, "grad_norm": 0.5247832767404789, "learning_rate": 8.663683188872686e-06, "loss": 0.0585, "step": 8965 }, { "epoch": 0.6980487567514962, "grad_norm": 0.6003864574316162, "learning_rate": 8.659571070559784e-06, "loss": 0.0838, "step": 8966 }, { "epoch": 0.698126611843706, "grad_norm": 0.5921016120689329, "learning_rate": 8.655459658702308e-06, "loss": 0.0842, "step": 8967 }, { "epoch": 0.6982044669359155, "grad_norm": 0.4324221241856997, "learning_rate": 8.651348953556377e-06, "loss": 0.0413, "step": 8968 }, { "epoch": 0.6982823220281251, "grad_norm": 0.5098213126093308, "learning_rate": 8.647238955378056e-06, "loss": 0.0539, "step": 8969 }, { "epoch": 0.6983601771203348, "grad_norm": 0.4923255579396652, "learning_rate": 8.643129664423387e-06, "loss": 0.0488, "step": 8970 }, { "epoch": 0.6984380322125444, "grad_norm": 0.5280205346597834, "learning_rate": 8.639021080948349e-06, "loss": 0.0509, "step": 8971 }, { "epoch": 0.698515887304754, "grad_norm": 0.5120454691997509, "learning_rate": 8.634913205208903e-06, "loss": 0.0481, "step": 8972 }, { "epoch": 0.6985937423969637, "grad_norm": 0.49296590019525416, "learning_rate": 8.630806037460945e-06, "loss": 0.0478, "step": 8973 }, { "epoch": 0.6986715974891733, "grad_norm": 0.5285138886942395, "learning_rate": 8.626699577960332e-06, "loss": 0.0539, "step": 8974 }, { "epoch": 0.6987494525813829, "grad_norm": 0.5397934978133505, "learning_rate": 8.622593826962873e-06, "loss": 0.0592, "step": 8975 }, { "epoch": 0.6988273076735926, "grad_norm": 0.5871190846989651, "learning_rate": 8.61848878472434e-06, "loss": 0.0765, "step": 8976 }, { "epoch": 0.6989051627658022, "grad_norm": 0.49010444751594456, "learning_rate": 8.614384451500461e-06, "loss": 0.0491, "step": 8977 }, { "epoch": 0.6989830178580118, "grad_norm": 0.5945746660291817, "learning_rate": 8.610280827546913e-06, "loss": 0.0678, "step": 8978 }, { "epoch": 0.6990608729502213, "grad_norm": 0.6007266437628831, "learning_rate": 8.606177913119333e-06, "loss": 0.0729, "step": 8979 }, { "epoch": 0.699138728042431, "grad_norm": 0.645965031297783, "learning_rate": 8.602075708473312e-06, "loss": 0.0919, "step": 8980 }, { "epoch": 0.6992165831346406, "grad_norm": 0.43815507316448854, "learning_rate": 8.5979742138644e-06, "loss": 0.0349, "step": 8981 }, { "epoch": 0.6992944382268502, "grad_norm": 0.5784011011509511, "learning_rate": 8.593873429548092e-06, "loss": 0.0802, "step": 8982 }, { "epoch": 0.6993722933190599, "grad_norm": 0.4363619688300399, "learning_rate": 8.589773355779865e-06, "loss": 0.0367, "step": 8983 }, { "epoch": 0.6994501484112695, "grad_norm": 0.555922017038763, "learning_rate": 8.585673992815125e-06, "loss": 0.0677, "step": 8984 }, { "epoch": 0.6995280035034791, "grad_norm": 0.49543188875959526, "learning_rate": 8.58157534090924e-06, "loss": 0.056, "step": 8985 }, { "epoch": 0.6996058585956888, "grad_norm": 0.5342323546414359, "learning_rate": 8.577477400317545e-06, "loss": 0.0348, "step": 8986 }, { "epoch": 0.6996837136878984, "grad_norm": 0.5033977690970648, "learning_rate": 8.573380171295314e-06, "loss": 0.0521, "step": 8987 }, { "epoch": 0.699761568780108, "grad_norm": 0.5136264693491556, "learning_rate": 8.569283654097788e-06, "loss": 0.0465, "step": 8988 }, { "epoch": 0.6998394238723177, "grad_norm": 0.5775539790367261, "learning_rate": 8.565187848980163e-06, "loss": 0.0669, "step": 8989 }, { "epoch": 0.6999172789645273, "grad_norm": 0.4382066361089864, "learning_rate": 8.561092756197584e-06, "loss": 0.0445, "step": 8990 }, { "epoch": 0.6999951340567369, "grad_norm": 0.5319535107638393, "learning_rate": 8.556998376005161e-06, "loss": 0.0617, "step": 8991 }, { "epoch": 0.7000729891489466, "grad_norm": 0.575770424321825, "learning_rate": 8.552904708657945e-06, "loss": 0.05, "step": 8992 }, { "epoch": 0.7001508442411561, "grad_norm": 0.5018172374965794, "learning_rate": 8.548811754410965e-06, "loss": 0.0589, "step": 8993 }, { "epoch": 0.7002286993333657, "grad_norm": 0.5520879620416902, "learning_rate": 8.544719513519191e-06, "loss": 0.064, "step": 8994 }, { "epoch": 0.7003065544255754, "grad_norm": 0.3877588647666821, "learning_rate": 8.540627986237543e-06, "loss": 0.0282, "step": 8995 }, { "epoch": 0.700384409517785, "grad_norm": 0.5642110593597977, "learning_rate": 8.536537172820911e-06, "loss": 0.0612, "step": 8996 }, { "epoch": 0.7004622646099946, "grad_norm": 0.47983999647254216, "learning_rate": 8.532447073524129e-06, "loss": 0.0532, "step": 8997 }, { "epoch": 0.7005401197022043, "grad_norm": 0.5713370172950064, "learning_rate": 8.528357688601989e-06, "loss": 0.0711, "step": 8998 }, { "epoch": 0.7006179747944139, "grad_norm": 0.5563548782019281, "learning_rate": 8.524269018309248e-06, "loss": 0.0489, "step": 8999 }, { "epoch": 0.7006958298866235, "grad_norm": 0.38471216804796005, "learning_rate": 8.520181062900604e-06, "loss": 0.0302, "step": 9000 }, { "epoch": 0.7006958298866235, "eval_loss": 0.007248206529766321, "eval_runtime": 162.8058, "eval_samples_per_second": 17.69, "eval_steps_per_second": 0.633, "step": 9000 }, { "epoch": 0.7007736849788332, "grad_norm": 0.5487877361091797, "learning_rate": 8.51609382263072e-06, "loss": 0.051, "step": 9001 }, { "epoch": 0.7008515400710428, "grad_norm": 0.4569539105980525, "learning_rate": 8.512007297754204e-06, "loss": 0.0405, "step": 9002 }, { "epoch": 0.7009293951632524, "grad_norm": 0.5609411930138685, "learning_rate": 8.507921488525644e-06, "loss": 0.0515, "step": 9003 }, { "epoch": 0.7010072502554621, "grad_norm": 0.46405600890338067, "learning_rate": 8.503836395199556e-06, "loss": 0.0421, "step": 9004 }, { "epoch": 0.7010851053476717, "grad_norm": 0.5312646413294677, "learning_rate": 8.499752018030424e-06, "loss": 0.0596, "step": 9005 }, { "epoch": 0.7011629604398812, "grad_norm": 0.44996548859455876, "learning_rate": 8.495668357272686e-06, "loss": 0.0288, "step": 9006 }, { "epoch": 0.7012408155320909, "grad_norm": 0.6249390204536784, "learning_rate": 8.491585413180734e-06, "loss": 0.0765, "step": 9007 }, { "epoch": 0.7013186706243005, "grad_norm": 0.5633760120124317, "learning_rate": 8.487503186008916e-06, "loss": 0.0602, "step": 9008 }, { "epoch": 0.7013965257165101, "grad_norm": 0.42843166556593204, "learning_rate": 8.483421676011537e-06, "loss": 0.0334, "step": 9009 }, { "epoch": 0.7014743808087198, "grad_norm": 0.5532005546603651, "learning_rate": 8.479340883442855e-06, "loss": 0.0659, "step": 9010 }, { "epoch": 0.7015522359009294, "grad_norm": 0.47921325890542293, "learning_rate": 8.475260808557084e-06, "loss": 0.0415, "step": 9011 }, { "epoch": 0.701630090993139, "grad_norm": 0.4454044534294584, "learning_rate": 8.471181451608385e-06, "loss": 0.0404, "step": 9012 }, { "epoch": 0.7017079460853487, "grad_norm": 0.4590194038980952, "learning_rate": 8.4671028128509e-06, "loss": 0.0376, "step": 9013 }, { "epoch": 0.7017858011775583, "grad_norm": 0.5803933975446426, "learning_rate": 8.463024892538699e-06, "loss": 0.0706, "step": 9014 }, { "epoch": 0.7018636562697679, "grad_norm": 0.6279032853035643, "learning_rate": 8.458947690925818e-06, "loss": 0.0876, "step": 9015 }, { "epoch": 0.7019415113619775, "grad_norm": 0.6185485523992879, "learning_rate": 8.45487120826625e-06, "loss": 0.0679, "step": 9016 }, { "epoch": 0.7020193664541872, "grad_norm": 0.6198502791115511, "learning_rate": 8.450795444813947e-06, "loss": 0.0844, "step": 9017 }, { "epoch": 0.7020972215463968, "grad_norm": 0.5354785392989516, "learning_rate": 8.446720400822785e-06, "loss": 0.0535, "step": 9018 }, { "epoch": 0.7021750766386063, "grad_norm": 0.5786559128154162, "learning_rate": 8.442646076546646e-06, "loss": 0.0616, "step": 9019 }, { "epoch": 0.702252931730816, "grad_norm": 0.5664580280560239, "learning_rate": 8.438572472239331e-06, "loss": 0.0599, "step": 9020 }, { "epoch": 0.7023307868230256, "grad_norm": 0.559883593759786, "learning_rate": 8.434499588154606e-06, "loss": 0.0594, "step": 9021 }, { "epoch": 0.7024086419152352, "grad_norm": 0.5831086309810166, "learning_rate": 8.430427424546195e-06, "loss": 0.0712, "step": 9022 }, { "epoch": 0.7024864970074449, "grad_norm": 0.5020124464725743, "learning_rate": 8.426355981667767e-06, "loss": 0.0504, "step": 9023 }, { "epoch": 0.7025643520996545, "grad_norm": 0.6401617239556684, "learning_rate": 8.422285259772968e-06, "loss": 0.0645, "step": 9024 }, { "epoch": 0.7026422071918641, "grad_norm": 0.4935805330683849, "learning_rate": 8.418215259115376e-06, "loss": 0.0466, "step": 9025 }, { "epoch": 0.7027200622840738, "grad_norm": 0.5080048118146449, "learning_rate": 8.414145979948542e-06, "loss": 0.066, "step": 9026 }, { "epoch": 0.7027979173762834, "grad_norm": 0.5884284265655358, "learning_rate": 8.410077422525948e-06, "loss": 0.0677, "step": 9027 }, { "epoch": 0.702875772468493, "grad_norm": 0.5654059855807957, "learning_rate": 8.406009587101047e-06, "loss": 0.061, "step": 9028 }, { "epoch": 0.7029536275607027, "grad_norm": 0.5243048579773133, "learning_rate": 8.401942473927258e-06, "loss": 0.0625, "step": 9029 }, { "epoch": 0.7030314826529123, "grad_norm": 0.5741722030994967, "learning_rate": 8.397876083257937e-06, "loss": 0.0632, "step": 9030 }, { "epoch": 0.7031093377451219, "grad_norm": 0.5018209871561408, "learning_rate": 8.393810415346402e-06, "loss": 0.0536, "step": 9031 }, { "epoch": 0.7031871928373316, "grad_norm": 0.5728713434592145, "learning_rate": 8.389745470445924e-06, "loss": 0.0623, "step": 9032 }, { "epoch": 0.7032650479295411, "grad_norm": 0.43605876422367384, "learning_rate": 8.385681248809723e-06, "loss": 0.0347, "step": 9033 }, { "epoch": 0.7033429030217507, "grad_norm": 0.5169229781029511, "learning_rate": 8.381617750690996e-06, "loss": 0.0599, "step": 9034 }, { "epoch": 0.7034207581139604, "grad_norm": 0.5100925701940444, "learning_rate": 8.377554976342876e-06, "loss": 0.0398, "step": 9035 }, { "epoch": 0.70349861320617, "grad_norm": 0.5207771314869496, "learning_rate": 8.373492926018448e-06, "loss": 0.0558, "step": 9036 }, { "epoch": 0.7035764682983796, "grad_norm": 0.5766606924839441, "learning_rate": 8.369431599970756e-06, "loss": 0.0535, "step": 9037 }, { "epoch": 0.7036543233905893, "grad_norm": 0.49007836451499487, "learning_rate": 8.365370998452805e-06, "loss": 0.0432, "step": 9038 }, { "epoch": 0.7037321784827989, "grad_norm": 0.5969474460801786, "learning_rate": 8.361311121717556e-06, "loss": 0.0738, "step": 9039 }, { "epoch": 0.7038100335750085, "grad_norm": 0.5246569995606495, "learning_rate": 8.357251970017921e-06, "loss": 0.0501, "step": 9040 }, { "epoch": 0.7038878886672182, "grad_norm": 0.6233722080309294, "learning_rate": 8.353193543606761e-06, "loss": 0.0746, "step": 9041 }, { "epoch": 0.7039657437594278, "grad_norm": 0.4936447518503055, "learning_rate": 8.349135842736898e-06, "loss": 0.0527, "step": 9042 }, { "epoch": 0.7040435988516374, "grad_norm": 0.686149146972374, "learning_rate": 8.3450788676611e-06, "loss": 0.0857, "step": 9043 }, { "epoch": 0.7041214539438471, "grad_norm": 0.5077531565094104, "learning_rate": 8.341022618632121e-06, "loss": 0.0442, "step": 9044 }, { "epoch": 0.7041993090360567, "grad_norm": 0.553555857540635, "learning_rate": 8.336967095902627e-06, "loss": 0.0661, "step": 9045 }, { "epoch": 0.7042771641282662, "grad_norm": 0.573202877231295, "learning_rate": 8.332912299725257e-06, "loss": 0.0512, "step": 9046 }, { "epoch": 0.7043550192204759, "grad_norm": 0.657963247682897, "learning_rate": 8.328858230352612e-06, "loss": 0.0973, "step": 9047 }, { "epoch": 0.7044328743126855, "grad_norm": 0.5467480526325704, "learning_rate": 8.324804888037234e-06, "loss": 0.0482, "step": 9048 }, { "epoch": 0.7045107294048951, "grad_norm": 0.5125544682917534, "learning_rate": 8.32075227303164e-06, "loss": 0.0572, "step": 9049 }, { "epoch": 0.7045885844971047, "grad_norm": 0.5502729039507407, "learning_rate": 8.31670038558828e-06, "loss": 0.0647, "step": 9050 }, { "epoch": 0.7045885844971047, "eval_loss": 0.007202477660030127, "eval_runtime": 162.3387, "eval_samples_per_second": 17.741, "eval_steps_per_second": 0.634, "step": 9050 }, { "epoch": 0.7046664395893144, "grad_norm": 0.6034215437687407, "learning_rate": 8.312649225959572e-06, "loss": 0.0844, "step": 9051 }, { "epoch": 0.704744294681524, "grad_norm": 0.4703485744463827, "learning_rate": 8.308598794397881e-06, "loss": 0.0548, "step": 9052 }, { "epoch": 0.7048221497737336, "grad_norm": 0.510895684400087, "learning_rate": 8.304549091155532e-06, "loss": 0.0564, "step": 9053 }, { "epoch": 0.7049000048659433, "grad_norm": 0.5373733946387105, "learning_rate": 8.3005001164848e-06, "loss": 0.0502, "step": 9054 }, { "epoch": 0.7049778599581529, "grad_norm": 0.5414064197674086, "learning_rate": 8.296451870637916e-06, "loss": 0.0542, "step": 9055 }, { "epoch": 0.7050557150503625, "grad_norm": 0.4787778003500319, "learning_rate": 8.292404353867072e-06, "loss": 0.047, "step": 9056 }, { "epoch": 0.7051335701425722, "grad_norm": 0.5421524541531028, "learning_rate": 8.288357566424406e-06, "loss": 0.0624, "step": 9057 }, { "epoch": 0.7052114252347818, "grad_norm": 0.4754184785024631, "learning_rate": 8.284311508562013e-06, "loss": 0.0567, "step": 9058 }, { "epoch": 0.7052892803269913, "grad_norm": 0.5556293046362839, "learning_rate": 8.280266180531937e-06, "loss": 0.063, "step": 9059 }, { "epoch": 0.705367135419201, "grad_norm": 0.475983086290371, "learning_rate": 8.2762215825862e-06, "loss": 0.0531, "step": 9060 }, { "epoch": 0.7054449905114106, "grad_norm": 0.6550225505545711, "learning_rate": 8.272177714976747e-06, "loss": 0.0775, "step": 9061 }, { "epoch": 0.7055228456036202, "grad_norm": 0.5016763564470756, "learning_rate": 8.2681345779555e-06, "loss": 0.0454, "step": 9062 }, { "epoch": 0.7056007006958299, "grad_norm": 0.5675010752789517, "learning_rate": 8.264092171774325e-06, "loss": 0.0656, "step": 9063 }, { "epoch": 0.7056785557880395, "grad_norm": 0.5205204126899283, "learning_rate": 8.260050496685042e-06, "loss": 0.0477, "step": 9064 }, { "epoch": 0.7057564108802491, "grad_norm": 0.5555791228589083, "learning_rate": 8.25600955293943e-06, "loss": 0.0592, "step": 9065 }, { "epoch": 0.7058342659724588, "grad_norm": 0.5847145237119327, "learning_rate": 8.251969340789224e-06, "loss": 0.0585, "step": 9066 }, { "epoch": 0.7059121210646684, "grad_norm": 0.39790531640397203, "learning_rate": 8.247929860486106e-06, "loss": 0.0265, "step": 9067 }, { "epoch": 0.705989976156878, "grad_norm": 0.4832899817448365, "learning_rate": 8.243891112281716e-06, "loss": 0.0445, "step": 9068 }, { "epoch": 0.7060678312490877, "grad_norm": 0.7254332470284941, "learning_rate": 8.239853096427647e-06, "loss": 0.098, "step": 9069 }, { "epoch": 0.7061456863412973, "grad_norm": 0.5023142169284114, "learning_rate": 8.235815813175459e-06, "loss": 0.0425, "step": 9070 }, { "epoch": 0.7062235414335069, "grad_norm": 0.47993392050724043, "learning_rate": 8.23177926277665e-06, "loss": 0.0472, "step": 9071 }, { "epoch": 0.7063013965257166, "grad_norm": 0.5733154417854035, "learning_rate": 8.227743445482674e-06, "loss": 0.0794, "step": 9072 }, { "epoch": 0.7063792516179261, "grad_norm": 0.5026506569620893, "learning_rate": 8.22370836154495e-06, "loss": 0.052, "step": 9073 }, { "epoch": 0.7064571067101357, "grad_norm": 0.5028922587101203, "learning_rate": 8.21967401121484e-06, "loss": 0.0595, "step": 9074 }, { "epoch": 0.7065349618023454, "grad_norm": 0.4408460464829643, "learning_rate": 8.215640394743666e-06, "loss": 0.0381, "step": 9075 }, { "epoch": 0.706612816894555, "grad_norm": 0.48710930710142636, "learning_rate": 8.211607512382707e-06, "loss": 0.0441, "step": 9076 }, { "epoch": 0.7066906719867646, "grad_norm": 0.4627028174144491, "learning_rate": 8.207575364383188e-06, "loss": 0.045, "step": 9077 }, { "epoch": 0.7067685270789743, "grad_norm": 0.5636873258365288, "learning_rate": 8.203543950996295e-06, "loss": 0.0626, "step": 9078 }, { "epoch": 0.7068463821711839, "grad_norm": 0.5046172913064148, "learning_rate": 8.19951327247316e-06, "loss": 0.0571, "step": 9079 }, { "epoch": 0.7069242372633935, "grad_norm": 0.6341773439989387, "learning_rate": 8.195483329064883e-06, "loss": 0.0869, "step": 9080 }, { "epoch": 0.7070020923556032, "grad_norm": 0.673405463055557, "learning_rate": 8.191454121022513e-06, "loss": 0.0979, "step": 9081 }, { "epoch": 0.7070799474478128, "grad_norm": 0.5869475001284824, "learning_rate": 8.187425648597045e-06, "loss": 0.0648, "step": 9082 }, { "epoch": 0.7071578025400224, "grad_norm": 0.4987201438107702, "learning_rate": 8.183397912039442e-06, "loss": 0.0511, "step": 9083 }, { "epoch": 0.7072356576322321, "grad_norm": 0.6184642434960965, "learning_rate": 8.17937091160059e-06, "loss": 0.0758, "step": 9084 }, { "epoch": 0.7073135127244417, "grad_norm": 0.42028094215586764, "learning_rate": 8.175344647531379e-06, "loss": 0.0365, "step": 9085 }, { "epoch": 0.7073913678166512, "grad_norm": 0.5845002019938019, "learning_rate": 8.171319120082615e-06, "loss": 0.082, "step": 9086 }, { "epoch": 0.7074692229088608, "grad_norm": 0.5234215632408311, "learning_rate": 8.167294329505069e-06, "loss": 0.043, "step": 9087 }, { "epoch": 0.7075470780010705, "grad_norm": 0.5208277318452285, "learning_rate": 8.163270276049467e-06, "loss": 0.0579, "step": 9088 }, { "epoch": 0.7076249330932801, "grad_norm": 0.4091942989513499, "learning_rate": 8.159246959966482e-06, "loss": 0.0321, "step": 9089 }, { "epoch": 0.7077027881854897, "grad_norm": 0.527317395246141, "learning_rate": 8.155224381506763e-06, "loss": 0.045, "step": 9090 }, { "epoch": 0.7077806432776994, "grad_norm": 0.39665867537358623, "learning_rate": 8.151202540920891e-06, "loss": 0.0287, "step": 9091 }, { "epoch": 0.707858498369909, "grad_norm": 0.5294776235193889, "learning_rate": 8.14718143845941e-06, "loss": 0.0537, "step": 9092 }, { "epoch": 0.7079363534621186, "grad_norm": 0.5848126281251426, "learning_rate": 8.14316107437281e-06, "loss": 0.0741, "step": 9093 }, { "epoch": 0.7080142085543283, "grad_norm": 0.49801206696885564, "learning_rate": 8.139141448911538e-06, "loss": 0.0595, "step": 9094 }, { "epoch": 0.7080920636465379, "grad_norm": 0.6257536388729711, "learning_rate": 8.135122562325998e-06, "loss": 0.0819, "step": 9095 }, { "epoch": 0.7081699187387475, "grad_norm": 0.4572907255134475, "learning_rate": 8.13110441486656e-06, "loss": 0.0387, "step": 9096 }, { "epoch": 0.7082477738309572, "grad_norm": 0.38456614954678775, "learning_rate": 8.127087006783528e-06, "loss": 0.0275, "step": 9097 }, { "epoch": 0.7083256289231668, "grad_norm": 0.5474861161047452, "learning_rate": 8.123070338327168e-06, "loss": 0.0588, "step": 9098 }, { "epoch": 0.7084034840153763, "grad_norm": 0.5504534609667185, "learning_rate": 8.1190544097477e-06, "loss": 0.0648, "step": 9099 }, { "epoch": 0.708481339107586, "grad_norm": 0.570253370670762, "learning_rate": 8.115039221295288e-06, "loss": 0.0696, "step": 9100 }, { "epoch": 0.708481339107586, "eval_loss": 0.007155950181186199, "eval_runtime": 162.5845, "eval_samples_per_second": 17.714, "eval_steps_per_second": 0.634, "step": 9100 }, { "epoch": 0.7085591941997956, "grad_norm": 0.6060175610742848, "learning_rate": 8.111024773220084e-06, "loss": 0.059, "step": 9101 }, { "epoch": 0.7086370492920052, "grad_norm": 0.44579509386177757, "learning_rate": 8.107011065772146e-06, "loss": 0.0448, "step": 9102 }, { "epoch": 0.7087149043842149, "grad_norm": 0.5416725069546292, "learning_rate": 8.102998099201515e-06, "loss": 0.0479, "step": 9103 }, { "epoch": 0.7087927594764245, "grad_norm": 0.48177946125391696, "learning_rate": 8.098985873758182e-06, "loss": 0.0341, "step": 9104 }, { "epoch": 0.7088706145686341, "grad_norm": 0.4405567235523905, "learning_rate": 8.094974389692085e-06, "loss": 0.0424, "step": 9105 }, { "epoch": 0.7089484696608438, "grad_norm": 0.4956158564215585, "learning_rate": 8.09096364725313e-06, "loss": 0.0479, "step": 9106 }, { "epoch": 0.7090263247530534, "grad_norm": 0.5489206518595395, "learning_rate": 8.086953646691163e-06, "loss": 0.0586, "step": 9107 }, { "epoch": 0.709104179845263, "grad_norm": 0.5442087358527161, "learning_rate": 8.082944388255987e-06, "loss": 0.0613, "step": 9108 }, { "epoch": 0.7091820349374727, "grad_norm": 0.6018344151487796, "learning_rate": 8.07893587219736e-06, "loss": 0.0716, "step": 9109 }, { "epoch": 0.7092598900296823, "grad_norm": 0.5545983009570427, "learning_rate": 8.074928098764997e-06, "loss": 0.0594, "step": 9110 }, { "epoch": 0.7093377451218918, "grad_norm": 0.41753773043506104, "learning_rate": 8.070921068208556e-06, "loss": 0.0348, "step": 9111 }, { "epoch": 0.7094156002141015, "grad_norm": 0.561201704564609, "learning_rate": 8.066914780777663e-06, "loss": 0.0662, "step": 9112 }, { "epoch": 0.7094934553063111, "grad_norm": 0.4164121577373335, "learning_rate": 8.06290923672189e-06, "loss": 0.0337, "step": 9113 }, { "epoch": 0.7095713103985207, "grad_norm": 0.4411978047856019, "learning_rate": 8.058904436290761e-06, "loss": 0.0319, "step": 9114 }, { "epoch": 0.7096491654907304, "grad_norm": 0.5002414530917507, "learning_rate": 8.054900379733752e-06, "loss": 0.0461, "step": 9115 }, { "epoch": 0.70972702058294, "grad_norm": 0.5310705334297432, "learning_rate": 8.050897067300308e-06, "loss": 0.0553, "step": 9116 }, { "epoch": 0.7098048756751496, "grad_norm": 0.48647370773835635, "learning_rate": 8.04689449923981e-06, "loss": 0.0438, "step": 9117 }, { "epoch": 0.7098827307673593, "grad_norm": 0.4750749731777304, "learning_rate": 8.042892675801602e-06, "loss": 0.0574, "step": 9118 }, { "epoch": 0.7099605858595689, "grad_norm": 0.5225251148153763, "learning_rate": 8.038891597234976e-06, "loss": 0.062, "step": 9119 }, { "epoch": 0.7100384409517785, "grad_norm": 0.4855717069850395, "learning_rate": 8.034891263789182e-06, "loss": 0.0407, "step": 9120 }, { "epoch": 0.7101162960439882, "grad_norm": 0.42806794292852113, "learning_rate": 8.030891675713422e-06, "loss": 0.0368, "step": 9121 }, { "epoch": 0.7101941511361978, "grad_norm": 0.38755059661671615, "learning_rate": 8.026892833256848e-06, "loss": 0.0276, "step": 9122 }, { "epoch": 0.7102720062284074, "grad_norm": 0.577141485614417, "learning_rate": 8.022894736668574e-06, "loss": 0.0637, "step": 9123 }, { "epoch": 0.710349861320617, "grad_norm": 0.5029876880436256, "learning_rate": 8.01889738619766e-06, "loss": 0.0607, "step": 9124 }, { "epoch": 0.7104277164128266, "grad_norm": 0.5100346603110342, "learning_rate": 8.014900782093115e-06, "loss": 0.0517, "step": 9125 }, { "epoch": 0.7105055715050362, "grad_norm": 0.584434972930859, "learning_rate": 8.010904924603925e-06, "loss": 0.0551, "step": 9126 }, { "epoch": 0.7105834265972458, "grad_norm": 0.3964228901474849, "learning_rate": 8.006909813979005e-06, "loss": 0.0376, "step": 9127 }, { "epoch": 0.7106612816894555, "grad_norm": 0.542794089273378, "learning_rate": 8.002915450467228e-06, "loss": 0.0688, "step": 9128 }, { "epoch": 0.7107391367816651, "grad_norm": 0.4248880684352762, "learning_rate": 7.99892183431743e-06, "loss": 0.0403, "step": 9129 }, { "epoch": 0.7108169918738747, "grad_norm": 0.5152522631105252, "learning_rate": 7.99492896577839e-06, "loss": 0.053, "step": 9130 }, { "epoch": 0.7108948469660844, "grad_norm": 0.5437812235178064, "learning_rate": 7.990936845098846e-06, "loss": 0.0586, "step": 9131 }, { "epoch": 0.710972702058294, "grad_norm": 0.43009326180482055, "learning_rate": 7.986945472527492e-06, "loss": 0.0388, "step": 9132 }, { "epoch": 0.7110505571505036, "grad_norm": 0.5051853265878484, "learning_rate": 7.982954848312967e-06, "loss": 0.0542, "step": 9133 }, { "epoch": 0.7111284122427133, "grad_norm": 0.48141117192043426, "learning_rate": 7.978964972703869e-06, "loss": 0.0608, "step": 9134 }, { "epoch": 0.7112062673349229, "grad_norm": 0.43665853450827774, "learning_rate": 7.974975845948749e-06, "loss": 0.0438, "step": 9135 }, { "epoch": 0.7112841224271325, "grad_norm": 0.4946915265372431, "learning_rate": 7.970987468296107e-06, "loss": 0.0487, "step": 9136 }, { "epoch": 0.7113619775193422, "grad_norm": 0.5793668569956447, "learning_rate": 7.966999839994409e-06, "loss": 0.0771, "step": 9137 }, { "epoch": 0.7114398326115517, "grad_norm": 0.4348304359251514, "learning_rate": 7.963012961292062e-06, "loss": 0.0367, "step": 9138 }, { "epoch": 0.7115176877037613, "grad_norm": 0.4189217465615059, "learning_rate": 7.959026832437428e-06, "loss": 0.0362, "step": 9139 }, { "epoch": 0.711595542795971, "grad_norm": 0.5621484707697331, "learning_rate": 7.955041453678825e-06, "loss": 0.0522, "step": 9140 }, { "epoch": 0.7116733978881806, "grad_norm": 0.49582277696837784, "learning_rate": 7.951056825264523e-06, "loss": 0.0403, "step": 9141 }, { "epoch": 0.7117512529803902, "grad_norm": 0.4442035218544608, "learning_rate": 7.947072947442747e-06, "loss": 0.0471, "step": 9142 }, { "epoch": 0.7118291080725999, "grad_norm": 0.5097278442034654, "learning_rate": 7.94308982046167e-06, "loss": 0.0468, "step": 9143 }, { "epoch": 0.7119069631648095, "grad_norm": 0.5669609321926692, "learning_rate": 7.939107444569427e-06, "loss": 0.0567, "step": 9144 }, { "epoch": 0.7119848182570191, "grad_norm": 0.5524233390214376, "learning_rate": 7.935125820014099e-06, "loss": 0.0543, "step": 9145 }, { "epoch": 0.7120626733492288, "grad_norm": 0.47626535647810136, "learning_rate": 7.931144947043714e-06, "loss": 0.0504, "step": 9146 }, { "epoch": 0.7121405284414384, "grad_norm": 0.5326921116468429, "learning_rate": 7.927164825906278e-06, "loss": 0.0524, "step": 9147 }, { "epoch": 0.712218383533648, "grad_norm": 0.5673697623201706, "learning_rate": 7.923185456849725e-06, "loss": 0.0631, "step": 9148 }, { "epoch": 0.7122962386258577, "grad_norm": 0.45805117940427487, "learning_rate": 7.919206840121953e-06, "loss": 0.0478, "step": 9149 }, { "epoch": 0.7123740937180673, "grad_norm": 0.5710073902167104, "learning_rate": 7.915228975970816e-06, "loss": 0.0732, "step": 9150 }, { "epoch": 0.7123740937180673, "eval_loss": 0.0070964558981359005, "eval_runtime": 162.0862, "eval_samples_per_second": 17.768, "eval_steps_per_second": 0.635, "step": 9150 }, { "epoch": 0.7124519488102768, "grad_norm": 0.5186301159800483, "learning_rate": 7.911251864644098e-06, "loss": 0.0399, "step": 9151 }, { "epoch": 0.7125298039024865, "grad_norm": 0.4358433502320046, "learning_rate": 7.907275506389574e-06, "loss": 0.0391, "step": 9152 }, { "epoch": 0.7126076589946961, "grad_norm": 0.45968645287449306, "learning_rate": 7.903299901454945e-06, "loss": 0.0365, "step": 9153 }, { "epoch": 0.7126855140869057, "grad_norm": 0.49370836987107475, "learning_rate": 7.89932505008787e-06, "loss": 0.0352, "step": 9154 }, { "epoch": 0.7127633691791154, "grad_norm": 0.5643254997331597, "learning_rate": 7.895350952535967e-06, "loss": 0.0679, "step": 9155 }, { "epoch": 0.712841224271325, "grad_norm": 0.5816165653628028, "learning_rate": 7.891377609046797e-06, "loss": 0.0807, "step": 9156 }, { "epoch": 0.7129190793635346, "grad_norm": 0.3689465414447235, "learning_rate": 7.887405019867893e-06, "loss": 0.0283, "step": 9157 }, { "epoch": 0.7129969344557442, "grad_norm": 0.6208471305036737, "learning_rate": 7.883433185246722e-06, "loss": 0.0852, "step": 9158 }, { "epoch": 0.7130747895479539, "grad_norm": 0.4692172111740893, "learning_rate": 7.879462105430717e-06, "loss": 0.0476, "step": 9159 }, { "epoch": 0.7131526446401635, "grad_norm": 0.5746299556102434, "learning_rate": 7.875491780667246e-06, "loss": 0.0581, "step": 9160 }, { "epoch": 0.7132304997323731, "grad_norm": 0.47583826235889026, "learning_rate": 7.87152221120364e-06, "loss": 0.0498, "step": 9161 }, { "epoch": 0.7133083548245828, "grad_norm": 0.47751032505268737, "learning_rate": 7.867553397287198e-06, "loss": 0.0429, "step": 9162 }, { "epoch": 0.7133862099167924, "grad_norm": 0.4939785023305799, "learning_rate": 7.86358533916515e-06, "loss": 0.0491, "step": 9163 }, { "epoch": 0.713464065009002, "grad_norm": 0.39508012306416607, "learning_rate": 7.85961803708469e-06, "loss": 0.0344, "step": 9164 }, { "epoch": 0.7135419201012116, "grad_norm": 0.43570217464775046, "learning_rate": 7.855651491292961e-06, "loss": 0.0411, "step": 9165 }, { "epoch": 0.7136197751934212, "grad_norm": 0.4600297599243362, "learning_rate": 7.851685702037055e-06, "loss": 0.0427, "step": 9166 }, { "epoch": 0.7136976302856308, "grad_norm": 0.6067718315373165, "learning_rate": 7.847720669564035e-06, "loss": 0.0826, "step": 9167 }, { "epoch": 0.7137754853778405, "grad_norm": 0.4391435836677888, "learning_rate": 7.843756394120902e-06, "loss": 0.0357, "step": 9168 }, { "epoch": 0.7138533404700501, "grad_norm": 0.46565462263437074, "learning_rate": 7.8397928759546e-06, "loss": 0.0353, "step": 9169 }, { "epoch": 0.7139311955622597, "grad_norm": 0.44146052283199916, "learning_rate": 7.835830115312044e-06, "loss": 0.0483, "step": 9170 }, { "epoch": 0.7140090506544694, "grad_norm": 0.5393801782830943, "learning_rate": 7.831868112440093e-06, "loss": 0.0581, "step": 9171 }, { "epoch": 0.714086905746679, "grad_norm": 0.4899271841886713, "learning_rate": 7.82790686758556e-06, "loss": 0.0503, "step": 9172 }, { "epoch": 0.7141647608388886, "grad_norm": 0.4663189004268656, "learning_rate": 7.82394638099522e-06, "loss": 0.0382, "step": 9173 }, { "epoch": 0.7142426159310983, "grad_norm": 0.5303779324537355, "learning_rate": 7.819986652915787e-06, "loss": 0.062, "step": 9174 }, { "epoch": 0.7143204710233079, "grad_norm": 0.627217611957337, "learning_rate": 7.816027683593937e-06, "loss": 0.071, "step": 9175 }, { "epoch": 0.7143983261155175, "grad_norm": 0.5063551046690321, "learning_rate": 7.812069473276291e-06, "loss": 0.0591, "step": 9176 }, { "epoch": 0.7144761812077272, "grad_norm": 0.47744144713827663, "learning_rate": 7.80811202220943e-06, "loss": 0.0428, "step": 9177 }, { "epoch": 0.7145540362999367, "grad_norm": 0.4804860324773423, "learning_rate": 7.80415533063988e-06, "loss": 0.0472, "step": 9178 }, { "epoch": 0.7146318913921463, "grad_norm": 0.46147515514788323, "learning_rate": 7.80019939881413e-06, "loss": 0.0411, "step": 9179 }, { "epoch": 0.714709746484356, "grad_norm": 0.5017369997417741, "learning_rate": 7.796244226978611e-06, "loss": 0.0573, "step": 9180 }, { "epoch": 0.7147876015765656, "grad_norm": 0.6147564290220332, "learning_rate": 7.792289815379717e-06, "loss": 0.0649, "step": 9181 }, { "epoch": 0.7148654566687752, "grad_norm": 0.5298067335907939, "learning_rate": 7.788336164263776e-06, "loss": 0.0502, "step": 9182 }, { "epoch": 0.7149433117609849, "grad_norm": 0.4678881660718016, "learning_rate": 7.7843832738771e-06, "loss": 0.0507, "step": 9183 }, { "epoch": 0.7150211668531945, "grad_norm": 0.5582272730935549, "learning_rate": 7.78043114446593e-06, "loss": 0.0658, "step": 9184 }, { "epoch": 0.7150990219454041, "grad_norm": 0.5281107328561353, "learning_rate": 7.776479776276462e-06, "loss": 0.0607, "step": 9185 }, { "epoch": 0.7151768770376138, "grad_norm": 0.4852106132398036, "learning_rate": 7.772529169554845e-06, "loss": 0.046, "step": 9186 }, { "epoch": 0.7152547321298234, "grad_norm": 0.5029535419942, "learning_rate": 7.76857932454719e-06, "loss": 0.0457, "step": 9187 }, { "epoch": 0.715332587222033, "grad_norm": 0.48485691723706065, "learning_rate": 7.76463024149955e-06, "loss": 0.0542, "step": 9188 }, { "epoch": 0.7154104423142427, "grad_norm": 0.4539218271762529, "learning_rate": 7.760681920657933e-06, "loss": 0.0447, "step": 9189 }, { "epoch": 0.7154882974064523, "grad_norm": 0.48453383008374973, "learning_rate": 7.756734362268303e-06, "loss": 0.0433, "step": 9190 }, { "epoch": 0.7155661524986618, "grad_norm": 0.5031197527257862, "learning_rate": 7.752787566576573e-06, "loss": 0.0562, "step": 9191 }, { "epoch": 0.7156440075908715, "grad_norm": 0.4969971869123003, "learning_rate": 7.748841533828607e-06, "loss": 0.0436, "step": 9192 }, { "epoch": 0.7157218626830811, "grad_norm": 0.5324861786003598, "learning_rate": 7.744896264270232e-06, "loss": 0.0567, "step": 9193 }, { "epoch": 0.7157997177752907, "grad_norm": 0.3938930302023522, "learning_rate": 7.740951758147218e-06, "loss": 0.0277, "step": 9194 }, { "epoch": 0.7158775728675003, "grad_norm": 0.5154261705475307, "learning_rate": 7.737008015705283e-06, "loss": 0.0543, "step": 9195 }, { "epoch": 0.71595542795971, "grad_norm": 0.5561513699186321, "learning_rate": 7.733065037190111e-06, "loss": 0.0665, "step": 9196 }, { "epoch": 0.7160332830519196, "grad_norm": 0.5716592433918406, "learning_rate": 7.729122822847326e-06, "loss": 0.0563, "step": 9197 }, { "epoch": 0.7161111381441292, "grad_norm": 0.6149020392465607, "learning_rate": 7.72518137292251e-06, "loss": 0.0672, "step": 9198 }, { "epoch": 0.7161889932363389, "grad_norm": 0.49214382097670506, "learning_rate": 7.721240687661197e-06, "loss": 0.0455, "step": 9199 }, { "epoch": 0.7162668483285485, "grad_norm": 0.4513006312898022, "learning_rate": 7.717300767308878e-06, "loss": 0.0542, "step": 9200 }, { "epoch": 0.7162668483285485, "eval_loss": 0.007054983172565699, "eval_runtime": 167.3555, "eval_samples_per_second": 17.209, "eval_steps_per_second": 0.615, "step": 9200 }, { "epoch": 0.7163447034207581, "grad_norm": 0.5403814789779724, "learning_rate": 7.713361612110983e-06, "loss": 0.0634, "step": 9201 }, { "epoch": 0.7164225585129678, "grad_norm": 0.4290706943849592, "learning_rate": 7.709423222312903e-06, "loss": 0.0353, "step": 9202 }, { "epoch": 0.7165004136051774, "grad_norm": 0.4401098459006935, "learning_rate": 7.705485598159992e-06, "loss": 0.0348, "step": 9203 }, { "epoch": 0.7165782686973869, "grad_norm": 0.5111722958071883, "learning_rate": 7.70154873989754e-06, "loss": 0.0532, "step": 9204 }, { "epoch": 0.7166561237895966, "grad_norm": 0.37430133578605596, "learning_rate": 7.697612647770791e-06, "loss": 0.0341, "step": 9205 }, { "epoch": 0.7167339788818062, "grad_norm": 0.5805854515129734, "learning_rate": 7.693677322024948e-06, "loss": 0.0636, "step": 9206 }, { "epoch": 0.7168118339740158, "grad_norm": 0.5735908727033157, "learning_rate": 7.689742762905166e-06, "loss": 0.0653, "step": 9207 }, { "epoch": 0.7168896890662255, "grad_norm": 0.5762669621064576, "learning_rate": 7.685808970656542e-06, "loss": 0.0699, "step": 9208 }, { "epoch": 0.7169675441584351, "grad_norm": 0.417633966954164, "learning_rate": 7.681875945524143e-06, "loss": 0.0366, "step": 9209 }, { "epoch": 0.7170453992506447, "grad_norm": 0.5343738044865641, "learning_rate": 7.67794368775297e-06, "loss": 0.0533, "step": 9210 }, { "epoch": 0.7171232543428544, "grad_norm": 0.575064540394653, "learning_rate": 7.674012197587983e-06, "loss": 0.0814, "step": 9211 }, { "epoch": 0.717201109435064, "grad_norm": 0.5365354453991311, "learning_rate": 7.670081475274103e-06, "loss": 0.0715, "step": 9212 }, { "epoch": 0.7172789645272736, "grad_norm": 0.5125441902890578, "learning_rate": 7.666151521056182e-06, "loss": 0.055, "step": 9213 }, { "epoch": 0.7173568196194833, "grad_norm": 0.4580149439288903, "learning_rate": 7.662222335179055e-06, "loss": 0.0485, "step": 9214 }, { "epoch": 0.7174346747116929, "grad_norm": 0.3918906991960972, "learning_rate": 7.658293917887482e-06, "loss": 0.0303, "step": 9215 }, { "epoch": 0.7175125298039025, "grad_norm": 0.6267223796222834, "learning_rate": 7.654366269426189e-06, "loss": 0.0809, "step": 9216 }, { "epoch": 0.7175903848961122, "grad_norm": 0.63737797388151, "learning_rate": 7.650439390039857e-06, "loss": 0.0679, "step": 9217 }, { "epoch": 0.7176682399883217, "grad_norm": 0.5210194041225152, "learning_rate": 7.646513279973084e-06, "loss": 0.0502, "step": 9218 }, { "epoch": 0.7177460950805313, "grad_norm": 0.5404140250523206, "learning_rate": 7.642587939470478e-06, "loss": 0.0496, "step": 9219 }, { "epoch": 0.717823950172741, "grad_norm": 0.4735623050356026, "learning_rate": 7.638663368776555e-06, "loss": 0.0549, "step": 9220 }, { "epoch": 0.7179018052649506, "grad_norm": 0.5811134193080182, "learning_rate": 7.634739568135803e-06, "loss": 0.0682, "step": 9221 }, { "epoch": 0.7179796603571602, "grad_norm": 0.6547497091718121, "learning_rate": 7.630816537792653e-06, "loss": 0.0815, "step": 9222 }, { "epoch": 0.7180575154493699, "grad_norm": 0.5039488254075469, "learning_rate": 7.626894277991488e-06, "loss": 0.0645, "step": 9223 }, { "epoch": 0.7181353705415795, "grad_norm": 0.48468388207657503, "learning_rate": 7.622972788976655e-06, "loss": 0.0421, "step": 9224 }, { "epoch": 0.7182132256337891, "grad_norm": 0.5377471044375508, "learning_rate": 7.619052070992441e-06, "loss": 0.0743, "step": 9225 }, { "epoch": 0.7182910807259988, "grad_norm": 0.4882016769344513, "learning_rate": 7.615132124283093e-06, "loss": 0.0544, "step": 9226 }, { "epoch": 0.7183689358182084, "grad_norm": 0.5276463420491008, "learning_rate": 7.6112129490927945e-06, "loss": 0.0353, "step": 9227 }, { "epoch": 0.718446790910418, "grad_norm": 0.6236276387751996, "learning_rate": 7.607294545665689e-06, "loss": 0.0793, "step": 9228 }, { "epoch": 0.7185246460026276, "grad_norm": 0.3947351549204248, "learning_rate": 7.60337691424589e-06, "loss": 0.0357, "step": 9229 }, { "epoch": 0.7186025010948373, "grad_norm": 0.41392993078961643, "learning_rate": 7.599460055077439e-06, "loss": 0.0359, "step": 9230 }, { "epoch": 0.7186803561870468, "grad_norm": 0.6115038896876108, "learning_rate": 7.5955439684043394e-06, "loss": 0.0988, "step": 9231 }, { "epoch": 0.7187582112792564, "grad_norm": 0.4785837999172026, "learning_rate": 7.591628654470544e-06, "loss": 0.0576, "step": 9232 }, { "epoch": 0.7188360663714661, "grad_norm": 0.4115504846913637, "learning_rate": 7.587714113519953e-06, "loss": 0.0211, "step": 9233 }, { "epoch": 0.7189139214636757, "grad_norm": 0.5526811654863186, "learning_rate": 7.583800345796437e-06, "loss": 0.0769, "step": 9234 }, { "epoch": 0.7189917765558853, "grad_norm": 0.45315547142116686, "learning_rate": 7.579887351543804e-06, "loss": 0.0498, "step": 9235 }, { "epoch": 0.719069631648095, "grad_norm": 0.55512654565347, "learning_rate": 7.5759751310058035e-06, "loss": 0.0742, "step": 9236 }, { "epoch": 0.7191474867403046, "grad_norm": 0.5310446706291794, "learning_rate": 7.572063684426152e-06, "loss": 0.061, "step": 9237 }, { "epoch": 0.7192253418325142, "grad_norm": 0.4170623310701445, "learning_rate": 7.568153012048511e-06, "loss": 0.0472, "step": 9238 }, { "epoch": 0.7193031969247239, "grad_norm": 0.47761247535620316, "learning_rate": 7.564243114116509e-06, "loss": 0.0508, "step": 9239 }, { "epoch": 0.7193810520169335, "grad_norm": 0.5383973385095776, "learning_rate": 7.5603339908737095e-06, "loss": 0.0653, "step": 9240 }, { "epoch": 0.7194589071091431, "grad_norm": 0.5438081588875947, "learning_rate": 7.55642564256363e-06, "loss": 0.0717, "step": 9241 }, { "epoch": 0.7195367622013528, "grad_norm": 0.5972415145786638, "learning_rate": 7.552518069429742e-06, "loss": 0.0468, "step": 9242 }, { "epoch": 0.7196146172935624, "grad_norm": 0.5302957137676537, "learning_rate": 7.548611271715465e-06, "loss": 0.0497, "step": 9243 }, { "epoch": 0.7196924723857719, "grad_norm": 0.33735841054197846, "learning_rate": 7.5447052496641905e-06, "loss": 0.0233, "step": 9244 }, { "epoch": 0.7197703274779816, "grad_norm": 0.48301323387801387, "learning_rate": 7.540800003519229e-06, "loss": 0.0506, "step": 9245 }, { "epoch": 0.7198481825701912, "grad_norm": 0.548176446823257, "learning_rate": 7.536895533523862e-06, "loss": 0.0582, "step": 9246 }, { "epoch": 0.7199260376624008, "grad_norm": 0.5016849323766116, "learning_rate": 7.532991839921322e-06, "loss": 0.0546, "step": 9247 }, { "epoch": 0.7200038927546105, "grad_norm": 0.491925648506523, "learning_rate": 7.529088922954791e-06, "loss": 0.0421, "step": 9248 }, { "epoch": 0.7200817478468201, "grad_norm": 0.5561093668283379, "learning_rate": 7.525186782867393e-06, "loss": 0.054, "step": 9249 }, { "epoch": 0.7201596029390297, "grad_norm": 0.5159282814539197, "learning_rate": 7.521285419902231e-06, "loss": 0.056, "step": 9250 }, { "epoch": 0.7201596029390297, "eval_loss": 0.0069982572458684444, "eval_runtime": 166.8792, "eval_samples_per_second": 17.258, "eval_steps_per_second": 0.617, "step": 9250 }, { "epoch": 0.7202374580312394, "grad_norm": 0.47469917589601707, "learning_rate": 7.517384834302331e-06, "loss": 0.0457, "step": 9251 }, { "epoch": 0.720315313123449, "grad_norm": 0.535401232505702, "learning_rate": 7.513485026310683e-06, "loss": 0.0533, "step": 9252 }, { "epoch": 0.7203931682156586, "grad_norm": 0.5690938223865902, "learning_rate": 7.509585996170225e-06, "loss": 0.0613, "step": 9253 }, { "epoch": 0.7204710233078683, "grad_norm": 0.5141029313426597, "learning_rate": 7.505687744123851e-06, "loss": 0.0614, "step": 9254 }, { "epoch": 0.7205488784000779, "grad_norm": 0.41186168083792435, "learning_rate": 7.501790270414399e-06, "loss": 0.0328, "step": 9255 }, { "epoch": 0.7206267334922875, "grad_norm": 0.5622585245244817, "learning_rate": 7.497893575284669e-06, "loss": 0.0745, "step": 9256 }, { "epoch": 0.7207045885844972, "grad_norm": 0.5097190378568384, "learning_rate": 7.4939976589774035e-06, "loss": 0.0543, "step": 9257 }, { "epoch": 0.7207824436767067, "grad_norm": 0.4436647308396412, "learning_rate": 7.490102521735301e-06, "loss": 0.0428, "step": 9258 }, { "epoch": 0.7208602987689163, "grad_norm": 0.4817276880955104, "learning_rate": 7.486208163801001e-06, "loss": 0.0435, "step": 9259 }, { "epoch": 0.720938153861126, "grad_norm": 0.5028407501122342, "learning_rate": 7.482314585417121e-06, "loss": 0.0547, "step": 9260 }, { "epoch": 0.7210160089533356, "grad_norm": 0.4632351357111956, "learning_rate": 7.478421786826202e-06, "loss": 0.0286, "step": 9261 }, { "epoch": 0.7210938640455452, "grad_norm": 0.49545972258650417, "learning_rate": 7.4745297682707504e-06, "loss": 0.0441, "step": 9262 }, { "epoch": 0.7211717191377549, "grad_norm": 0.4894432424531371, "learning_rate": 7.470638529993222e-06, "loss": 0.0531, "step": 9263 }, { "epoch": 0.7212495742299645, "grad_norm": 0.3808273786934861, "learning_rate": 7.466748072236016e-06, "loss": 0.0301, "step": 9264 }, { "epoch": 0.7213274293221741, "grad_norm": 0.5519244854574099, "learning_rate": 7.462858395241494e-06, "loss": 0.063, "step": 9265 }, { "epoch": 0.7214052844143837, "grad_norm": 0.4969990879538833, "learning_rate": 7.4589694992519665e-06, "loss": 0.0485, "step": 9266 }, { "epoch": 0.7214831395065934, "grad_norm": 0.4567075358808547, "learning_rate": 7.4550813845096904e-06, "loss": 0.0436, "step": 9267 }, { "epoch": 0.721560994598803, "grad_norm": 0.5065120745939296, "learning_rate": 7.451194051256876e-06, "loss": 0.0554, "step": 9268 }, { "epoch": 0.7216388496910126, "grad_norm": 0.4734068340571609, "learning_rate": 7.447307499735683e-06, "loss": 0.05, "step": 9269 }, { "epoch": 0.7217167047832223, "grad_norm": 0.481927029236051, "learning_rate": 7.443421730188234e-06, "loss": 0.044, "step": 9270 }, { "epoch": 0.7217945598754318, "grad_norm": 0.5285841063098071, "learning_rate": 7.439536742856593e-06, "loss": 0.0691, "step": 9271 }, { "epoch": 0.7218724149676414, "grad_norm": 0.5061825902133248, "learning_rate": 7.4356525379827716e-06, "loss": 0.0621, "step": 9272 }, { "epoch": 0.7219502700598511, "grad_norm": 0.41961003345008346, "learning_rate": 7.43176911580874e-06, "loss": 0.0445, "step": 9273 }, { "epoch": 0.7220281251520607, "grad_norm": 0.6624503774719523, "learning_rate": 7.427886476576418e-06, "loss": 0.0645, "step": 9274 }, { "epoch": 0.7221059802442703, "grad_norm": 0.5106877204473383, "learning_rate": 7.424004620527672e-06, "loss": 0.0637, "step": 9275 }, { "epoch": 0.72218383533648, "grad_norm": 0.47384301687345076, "learning_rate": 7.420123547904327e-06, "loss": 0.0465, "step": 9276 }, { "epoch": 0.7222616904286896, "grad_norm": 0.5781370919722573, "learning_rate": 7.4162432589481545e-06, "loss": 0.0586, "step": 9277 }, { "epoch": 0.7223395455208992, "grad_norm": 0.4494415540701983, "learning_rate": 7.412363753900878e-06, "loss": 0.0377, "step": 9278 }, { "epoch": 0.7224174006131089, "grad_norm": 0.43739106491018315, "learning_rate": 7.4084850330041646e-06, "loss": 0.0464, "step": 9279 }, { "epoch": 0.7224952557053185, "grad_norm": 0.4804190729468693, "learning_rate": 7.404607096499656e-06, "loss": 0.0431, "step": 9280 }, { "epoch": 0.7225731107975281, "grad_norm": 0.47084827117097316, "learning_rate": 7.4007299446289215e-06, "loss": 0.0308, "step": 9281 }, { "epoch": 0.7226509658897378, "grad_norm": 0.45918770688353044, "learning_rate": 7.396853577633489e-06, "loss": 0.0321, "step": 9282 }, { "epoch": 0.7227288209819474, "grad_norm": 0.5164979510226531, "learning_rate": 7.3929779957548394e-06, "loss": 0.0516, "step": 9283 }, { "epoch": 0.7228066760741569, "grad_norm": 0.4700004707461437, "learning_rate": 7.389103199234411e-06, "loss": 0.0494, "step": 9284 }, { "epoch": 0.7228845311663666, "grad_norm": 0.5270326561658825, "learning_rate": 7.385229188313561e-06, "loss": 0.0468, "step": 9285 }, { "epoch": 0.7229623862585762, "grad_norm": 0.48690319309043995, "learning_rate": 7.381355963233645e-06, "loss": 0.04, "step": 9286 }, { "epoch": 0.7230402413507858, "grad_norm": 0.5146718967526681, "learning_rate": 7.37748352423594e-06, "loss": 0.0593, "step": 9287 }, { "epoch": 0.7231180964429955, "grad_norm": 0.524088098350225, "learning_rate": 7.373611871561679e-06, "loss": 0.052, "step": 9288 }, { "epoch": 0.7231959515352051, "grad_norm": 0.5208256222450628, "learning_rate": 7.369741005452052e-06, "loss": 0.0638, "step": 9289 }, { "epoch": 0.7232738066274147, "grad_norm": 0.5688006989234901, "learning_rate": 7.365870926148184e-06, "loss": 0.0591, "step": 9290 }, { "epoch": 0.7233516617196244, "grad_norm": 0.5238560986370254, "learning_rate": 7.362001633891178e-06, "loss": 0.0576, "step": 9291 }, { "epoch": 0.723429516811834, "grad_norm": 0.4756494828945094, "learning_rate": 7.358133128922067e-06, "loss": 0.0525, "step": 9292 }, { "epoch": 0.7235073719040436, "grad_norm": 0.45813505108831154, "learning_rate": 7.354265411481847e-06, "loss": 0.0487, "step": 9293 }, { "epoch": 0.7235852269962533, "grad_norm": 0.4542891233681885, "learning_rate": 7.350398481811445e-06, "loss": 0.0515, "step": 9294 }, { "epoch": 0.7236630820884629, "grad_norm": 0.40369119066583364, "learning_rate": 7.346532340151753e-06, "loss": 0.0325, "step": 9295 }, { "epoch": 0.7237409371806725, "grad_norm": 0.49648124077333017, "learning_rate": 7.342666986743625e-06, "loss": 0.0516, "step": 9296 }, { "epoch": 0.7238187922728821, "grad_norm": 0.4267437528824889, "learning_rate": 7.338802421827851e-06, "loss": 0.034, "step": 9297 }, { "epoch": 0.7238966473650917, "grad_norm": 0.427582594794782, "learning_rate": 7.334938645645171e-06, "loss": 0.042, "step": 9298 }, { "epoch": 0.7239745024573013, "grad_norm": 0.45303949060931387, "learning_rate": 7.331075658436284e-06, "loss": 0.0408, "step": 9299 }, { "epoch": 0.724052357549511, "grad_norm": 0.5355684394444952, "learning_rate": 7.327213460441827e-06, "loss": 0.0609, "step": 9300 }, { "epoch": 0.724052357549511, "eval_loss": 0.006943481974303722, "eval_runtime": 167.103, "eval_samples_per_second": 17.235, "eval_steps_per_second": 0.616, "step": 9300 }, { "epoch": 0.7241302126417206, "grad_norm": 0.4367354846388249, "learning_rate": 7.32335205190241e-06, "loss": 0.0423, "step": 9301 }, { "epoch": 0.7242080677339302, "grad_norm": 0.5393693220099157, "learning_rate": 7.319491433058583e-06, "loss": 0.0608, "step": 9302 }, { "epoch": 0.7242859228261398, "grad_norm": 0.4302167579572659, "learning_rate": 7.315631604150828e-06, "loss": 0.0423, "step": 9303 }, { "epoch": 0.7243637779183495, "grad_norm": 0.5056090157889294, "learning_rate": 7.3117725654196035e-06, "loss": 0.0429, "step": 9304 }, { "epoch": 0.7244416330105591, "grad_norm": 0.587053557224121, "learning_rate": 7.3079143171053e-06, "loss": 0.0763, "step": 9305 }, { "epoch": 0.7245194881027687, "grad_norm": 0.46716697841572435, "learning_rate": 7.304056859448283e-06, "loss": 0.0539, "step": 9306 }, { "epoch": 0.7245973431949784, "grad_norm": 0.442964538369221, "learning_rate": 7.3002001926888485e-06, "loss": 0.0361, "step": 9307 }, { "epoch": 0.724675198287188, "grad_norm": 0.5805853935036217, "learning_rate": 7.296344317067247e-06, "loss": 0.0812, "step": 9308 }, { "epoch": 0.7247530533793975, "grad_norm": 0.4719542615815587, "learning_rate": 7.292489232823683e-06, "loss": 0.0393, "step": 9309 }, { "epoch": 0.7248309084716072, "grad_norm": 0.44981807937847595, "learning_rate": 7.288634940198302e-06, "loss": 0.0396, "step": 9310 }, { "epoch": 0.7249087635638168, "grad_norm": 0.6028562938081099, "learning_rate": 7.28478143943123e-06, "loss": 0.0677, "step": 9311 }, { "epoch": 0.7249866186560264, "grad_norm": 0.4956235932806421, "learning_rate": 7.2809287307625e-06, "loss": 0.0546, "step": 9312 }, { "epoch": 0.7250644737482361, "grad_norm": 0.46132957103075556, "learning_rate": 7.277076814432125e-06, "loss": 0.0389, "step": 9313 }, { "epoch": 0.7251423288404457, "grad_norm": 0.4288019772291317, "learning_rate": 7.2732256906800615e-06, "loss": 0.0392, "step": 9314 }, { "epoch": 0.7252201839326553, "grad_norm": 0.4312319745980202, "learning_rate": 7.269375359746209e-06, "loss": 0.0427, "step": 9315 }, { "epoch": 0.725298039024865, "grad_norm": 0.5338721997022869, "learning_rate": 7.26552582187044e-06, "loss": 0.0467, "step": 9316 }, { "epoch": 0.7253758941170746, "grad_norm": 0.5152657914026129, "learning_rate": 7.261677077292555e-06, "loss": 0.0618, "step": 9317 }, { "epoch": 0.7254537492092842, "grad_norm": 0.5376979312646569, "learning_rate": 7.257829126252312e-06, "loss": 0.0606, "step": 9318 }, { "epoch": 0.7255316043014939, "grad_norm": 0.46815887438964177, "learning_rate": 7.253981968989421e-06, "loss": 0.0487, "step": 9319 }, { "epoch": 0.7256094593937035, "grad_norm": 0.600658621707349, "learning_rate": 7.250135605743542e-06, "loss": 0.0724, "step": 9320 }, { "epoch": 0.7256873144859131, "grad_norm": 0.567032465270841, "learning_rate": 7.246290036754284e-06, "loss": 0.072, "step": 9321 }, { "epoch": 0.7257651695781228, "grad_norm": 0.3982757531142553, "learning_rate": 7.242445262261211e-06, "loss": 0.0394, "step": 9322 }, { "epoch": 0.7258430246703323, "grad_norm": 0.5466102447435456, "learning_rate": 7.238601282503832e-06, "loss": 0.065, "step": 9323 }, { "epoch": 0.7259208797625419, "grad_norm": 0.5445965185879508, "learning_rate": 7.234758097721608e-06, "loss": 0.0538, "step": 9324 }, { "epoch": 0.7259987348547516, "grad_norm": 0.502072984880783, "learning_rate": 7.230915708153954e-06, "loss": 0.058, "step": 9325 }, { "epoch": 0.7260765899469612, "grad_norm": 0.49027665297185136, "learning_rate": 7.2270741140402245e-06, "loss": 0.055, "step": 9326 }, { "epoch": 0.7261544450391708, "grad_norm": 0.6524309872714514, "learning_rate": 7.223233315619744e-06, "loss": 0.099, "step": 9327 }, { "epoch": 0.7262323001313805, "grad_norm": 0.5160348920377864, "learning_rate": 7.219393313131775e-06, "loss": 0.0441, "step": 9328 }, { "epoch": 0.7263101552235901, "grad_norm": 0.5358607582303335, "learning_rate": 7.215554106815528e-06, "loss": 0.0706, "step": 9329 }, { "epoch": 0.7263880103157997, "grad_norm": 0.4852719180051832, "learning_rate": 7.211715696910167e-06, "loss": 0.0462, "step": 9330 }, { "epoch": 0.7264658654080094, "grad_norm": 0.5317914114505646, "learning_rate": 7.207878083654807e-06, "loss": 0.0647, "step": 9331 }, { "epoch": 0.726543720500219, "grad_norm": 0.44088131907076755, "learning_rate": 7.204041267288515e-06, "loss": 0.0371, "step": 9332 }, { "epoch": 0.7266215755924286, "grad_norm": 0.4936234829732401, "learning_rate": 7.200205248050305e-06, "loss": 0.0591, "step": 9333 }, { "epoch": 0.7266994306846383, "grad_norm": 0.45580414182753337, "learning_rate": 7.196370026179145e-06, "loss": 0.0369, "step": 9334 }, { "epoch": 0.7267772857768479, "grad_norm": 0.512055290524529, "learning_rate": 7.192535601913948e-06, "loss": 0.0553, "step": 9335 }, { "epoch": 0.7268551408690574, "grad_norm": 0.4433711242181088, "learning_rate": 7.1887019754935775e-06, "loss": 0.0335, "step": 9336 }, { "epoch": 0.726932995961267, "grad_norm": 0.4506037619744231, "learning_rate": 7.184869147156863e-06, "loss": 0.0527, "step": 9337 }, { "epoch": 0.7270108510534767, "grad_norm": 0.506296564164014, "learning_rate": 7.181037117142562e-06, "loss": 0.0512, "step": 9338 }, { "epoch": 0.7270887061456863, "grad_norm": 0.5096490861110207, "learning_rate": 7.177205885689396e-06, "loss": 0.0496, "step": 9339 }, { "epoch": 0.7271665612378959, "grad_norm": 0.47381657173319797, "learning_rate": 7.1733754530360315e-06, "loss": 0.0449, "step": 9340 }, { "epoch": 0.7272444163301056, "grad_norm": 0.46777282036195467, "learning_rate": 7.169545819421085e-06, "loss": 0.0457, "step": 9341 }, { "epoch": 0.7273222714223152, "grad_norm": 0.5445288983495683, "learning_rate": 7.165716985083124e-06, "loss": 0.075, "step": 9342 }, { "epoch": 0.7274001265145248, "grad_norm": 0.44271218937608364, "learning_rate": 7.161888950260669e-06, "loss": 0.0434, "step": 9343 }, { "epoch": 0.7274779816067345, "grad_norm": 0.5577103326552305, "learning_rate": 7.158061715192189e-06, "loss": 0.0525, "step": 9344 }, { "epoch": 0.7275558366989441, "grad_norm": 0.4370375995057601, "learning_rate": 7.1542352801160995e-06, "loss": 0.0434, "step": 9345 }, { "epoch": 0.7276336917911537, "grad_norm": 0.44673426984403475, "learning_rate": 7.150409645270766e-06, "loss": 0.0414, "step": 9346 }, { "epoch": 0.7277115468833634, "grad_norm": 0.46941534167459326, "learning_rate": 7.146584810894521e-06, "loss": 0.0355, "step": 9347 }, { "epoch": 0.727789401975573, "grad_norm": 0.3940331581777299, "learning_rate": 7.142760777225626e-06, "loss": 0.0337, "step": 9348 }, { "epoch": 0.7278672570677825, "grad_norm": 0.4943391213226741, "learning_rate": 7.1389375445022975e-06, "loss": 0.0438, "step": 9349 }, { "epoch": 0.7279451121599922, "grad_norm": 0.6187705933733391, "learning_rate": 7.1351151129627184e-06, "loss": 0.0873, "step": 9350 }, { "epoch": 0.7279451121599922, "eval_loss": 0.006887961644679308, "eval_runtime": 166.6756, "eval_samples_per_second": 17.279, "eval_steps_per_second": 0.618, "step": 9350 }, { "epoch": 0.7280229672522018, "grad_norm": 0.4878113021041673, "learning_rate": 7.131293482844981e-06, "loss": 0.0534, "step": 9351 }, { "epoch": 0.7281008223444114, "grad_norm": 0.5409728553011531, "learning_rate": 7.12747265438718e-06, "loss": 0.0448, "step": 9352 }, { "epoch": 0.7281786774366211, "grad_norm": 0.4082288547023898, "learning_rate": 7.1236526278273245e-06, "loss": 0.0415, "step": 9353 }, { "epoch": 0.7282565325288307, "grad_norm": 0.4650716172351038, "learning_rate": 7.119833403403387e-06, "loss": 0.041, "step": 9354 }, { "epoch": 0.7283343876210403, "grad_norm": 0.47654473153898497, "learning_rate": 7.1160149813532874e-06, "loss": 0.0622, "step": 9355 }, { "epoch": 0.72841224271325, "grad_norm": 0.4429431723395804, "learning_rate": 7.112197361914886e-06, "loss": 0.046, "step": 9356 }, { "epoch": 0.7284900978054596, "grad_norm": 0.40110098315038606, "learning_rate": 7.108380545326019e-06, "loss": 0.0403, "step": 9357 }, { "epoch": 0.7285679528976692, "grad_norm": 0.4859904132703346, "learning_rate": 7.1045645318244476e-06, "loss": 0.0573, "step": 9358 }, { "epoch": 0.7286458079898789, "grad_norm": 0.44332478239070494, "learning_rate": 7.100749321647897e-06, "loss": 0.0556, "step": 9359 }, { "epoch": 0.7287236630820885, "grad_norm": 0.5136958199329567, "learning_rate": 7.0969349150340265e-06, "loss": 0.072, "step": 9360 }, { "epoch": 0.7288015181742981, "grad_norm": 0.5155635635174498, "learning_rate": 7.093121312220461e-06, "loss": 0.0594, "step": 9361 }, { "epoch": 0.7288793732665078, "grad_norm": 0.4545520719515129, "learning_rate": 7.089308513444763e-06, "loss": 0.0392, "step": 9362 }, { "epoch": 0.7289572283587173, "grad_norm": 0.4572705318765426, "learning_rate": 7.085496518944468e-06, "loss": 0.052, "step": 9363 }, { "epoch": 0.7290350834509269, "grad_norm": 0.5005600017355195, "learning_rate": 7.081685328957033e-06, "loss": 0.0558, "step": 9364 }, { "epoch": 0.7291129385431366, "grad_norm": 0.38653911403211394, "learning_rate": 7.077874943719882e-06, "loss": 0.0295, "step": 9365 }, { "epoch": 0.7291907936353462, "grad_norm": 0.46200078925455074, "learning_rate": 7.074065363470384e-06, "loss": 0.046, "step": 9366 }, { "epoch": 0.7292686487275558, "grad_norm": 0.4975631963937201, "learning_rate": 7.070256588445847e-06, "loss": 0.0475, "step": 9367 }, { "epoch": 0.7293465038197655, "grad_norm": 0.47584259640053433, "learning_rate": 7.066448618883566e-06, "loss": 0.0505, "step": 9368 }, { "epoch": 0.7294243589119751, "grad_norm": 0.481414438339353, "learning_rate": 7.062641455020734e-06, "loss": 0.0528, "step": 9369 }, { "epoch": 0.7295022140041847, "grad_norm": 0.4630471455427946, "learning_rate": 7.0588350970945295e-06, "loss": 0.0446, "step": 9370 }, { "epoch": 0.7295800690963944, "grad_norm": 0.56672857821909, "learning_rate": 7.055029545342069e-06, "loss": 0.0536, "step": 9371 }, { "epoch": 0.729657924188604, "grad_norm": 0.5200378412883223, "learning_rate": 7.051224800000414e-06, "loss": 0.0495, "step": 9372 }, { "epoch": 0.7297357792808136, "grad_norm": 0.5344062110434181, "learning_rate": 7.047420861306598e-06, "loss": 0.0634, "step": 9373 }, { "epoch": 0.7298136343730232, "grad_norm": 0.577447894412569, "learning_rate": 7.043617729497576e-06, "loss": 0.0767, "step": 9374 }, { "epoch": 0.7298914894652329, "grad_norm": 0.49422486365147755, "learning_rate": 7.039815404810273e-06, "loss": 0.0479, "step": 9375 }, { "epoch": 0.7299693445574424, "grad_norm": 0.4434295444123964, "learning_rate": 7.036013887481549e-06, "loss": 0.0405, "step": 9376 }, { "epoch": 0.730047199649652, "grad_norm": 0.4439195341247708, "learning_rate": 7.03221317774822e-06, "loss": 0.052, "step": 9377 }, { "epoch": 0.7301250547418617, "grad_norm": 0.6790073774660593, "learning_rate": 7.028413275847068e-06, "loss": 0.1004, "step": 9378 }, { "epoch": 0.7302029098340713, "grad_norm": 0.48144612954147054, "learning_rate": 7.0246141820147905e-06, "loss": 0.049, "step": 9379 }, { "epoch": 0.7302807649262809, "grad_norm": 0.5310168529560487, "learning_rate": 7.020815896488058e-06, "loss": 0.0612, "step": 9380 }, { "epoch": 0.7303586200184906, "grad_norm": 0.44454885905836494, "learning_rate": 7.01701841950349e-06, "loss": 0.0421, "step": 9381 }, { "epoch": 0.7304364751107002, "grad_norm": 0.5554199985632683, "learning_rate": 7.0132217512976405e-06, "loss": 0.0666, "step": 9382 }, { "epoch": 0.7305143302029098, "grad_norm": 0.4309289964183079, "learning_rate": 7.009425892107043e-06, "loss": 0.0418, "step": 9383 }, { "epoch": 0.7305921852951195, "grad_norm": 0.45431233997613396, "learning_rate": 7.005630842168147e-06, "loss": 0.0281, "step": 9384 }, { "epoch": 0.7306700403873291, "grad_norm": 0.6297605291396033, "learning_rate": 7.001836601717373e-06, "loss": 0.0686, "step": 9385 }, { "epoch": 0.7307478954795387, "grad_norm": 0.4406858426039234, "learning_rate": 6.9980431709910805e-06, "loss": 0.0362, "step": 9386 }, { "epoch": 0.7308257505717484, "grad_norm": 0.4561482500405424, "learning_rate": 6.994250550225586e-06, "loss": 0.0372, "step": 9387 }, { "epoch": 0.730903605663958, "grad_norm": 0.5132343421715299, "learning_rate": 6.9904587396571465e-06, "loss": 0.0571, "step": 9388 }, { "epoch": 0.7309814607561675, "grad_norm": 0.6194104938358314, "learning_rate": 6.986667739521979e-06, "loss": 0.0724, "step": 9389 }, { "epoch": 0.7310593158483772, "grad_norm": 0.3837337839209765, "learning_rate": 6.982877550056244e-06, "loss": 0.0327, "step": 9390 }, { "epoch": 0.7311371709405868, "grad_norm": 0.5842409039183663, "learning_rate": 6.9790881714960515e-06, "loss": 0.0667, "step": 9391 }, { "epoch": 0.7312150260327964, "grad_norm": 0.48769646271157907, "learning_rate": 6.975299604077454e-06, "loss": 0.0649, "step": 9392 }, { "epoch": 0.7312928811250061, "grad_norm": 0.4028188924091293, "learning_rate": 6.971511848036478e-06, "loss": 0.035, "step": 9393 }, { "epoch": 0.7313707362172157, "grad_norm": 0.4985108932161922, "learning_rate": 6.967724903609075e-06, "loss": 0.0447, "step": 9394 }, { "epoch": 0.7314485913094253, "grad_norm": 0.5732404399697156, "learning_rate": 6.9639387710311535e-06, "loss": 0.0844, "step": 9395 }, { "epoch": 0.731526446401635, "grad_norm": 0.5500887465950729, "learning_rate": 6.960153450538569e-06, "loss": 0.0586, "step": 9396 }, { "epoch": 0.7316043014938446, "grad_norm": 0.4239858088207758, "learning_rate": 6.956368942367136e-06, "loss": 0.0484, "step": 9397 }, { "epoch": 0.7316821565860542, "grad_norm": 0.47435327290967405, "learning_rate": 6.952585246752603e-06, "loss": 0.0516, "step": 9398 }, { "epoch": 0.7317600116782639, "grad_norm": 0.47532029210354476, "learning_rate": 6.948802363930685e-06, "loss": 0.0492, "step": 9399 }, { "epoch": 0.7318378667704735, "grad_norm": 0.49848078477840363, "learning_rate": 6.94502029413703e-06, "loss": 0.0472, "step": 9400 }, { "epoch": 0.7318378667704735, "eval_loss": 0.006867117714136839, "eval_runtime": 167.0013, "eval_samples_per_second": 17.245, "eval_steps_per_second": 0.617, "step": 9400 }, { "epoch": 0.731915721862683, "grad_norm": 0.4779474514531713, "learning_rate": 6.941239037607248e-06, "loss": 0.0477, "step": 9401 }, { "epoch": 0.7319935769548928, "grad_norm": 0.503111837767718, "learning_rate": 6.937458594576893e-06, "loss": 0.0701, "step": 9402 }, { "epoch": 0.7320714320471023, "grad_norm": 0.5317438672368187, "learning_rate": 6.933678965281461e-06, "loss": 0.0608, "step": 9403 }, { "epoch": 0.7321492871393119, "grad_norm": 0.38083630724102024, "learning_rate": 6.929900149956419e-06, "loss": 0.0288, "step": 9404 }, { "epoch": 0.7322271422315216, "grad_norm": 0.4978269041703973, "learning_rate": 6.926122148837164e-06, "loss": 0.0552, "step": 9405 }, { "epoch": 0.7323049973237312, "grad_norm": 0.5074140362221244, "learning_rate": 6.922344962159044e-06, "loss": 0.0432, "step": 9406 }, { "epoch": 0.7323828524159408, "grad_norm": 0.39309027963135823, "learning_rate": 6.918568590157366e-06, "loss": 0.0376, "step": 9407 }, { "epoch": 0.7324607075081505, "grad_norm": 0.5419506334942529, "learning_rate": 6.914793033067373e-06, "loss": 0.0631, "step": 9408 }, { "epoch": 0.7325385626003601, "grad_norm": 0.4856403173668405, "learning_rate": 6.9110182911242715e-06, "loss": 0.0544, "step": 9409 }, { "epoch": 0.7326164176925697, "grad_norm": 0.5296243178636573, "learning_rate": 6.907244364563203e-06, "loss": 0.0512, "step": 9410 }, { "epoch": 0.7326942727847793, "grad_norm": 0.5058093520328236, "learning_rate": 6.9034712536192716e-06, "loss": 0.0519, "step": 9411 }, { "epoch": 0.732772127876989, "grad_norm": 0.4529700360955611, "learning_rate": 6.899698958527523e-06, "loss": 0.0491, "step": 9412 }, { "epoch": 0.7328499829691986, "grad_norm": 0.5022258110314763, "learning_rate": 6.8959274795229435e-06, "loss": 0.0587, "step": 9413 }, { "epoch": 0.7329278380614082, "grad_norm": 0.4300408261658626, "learning_rate": 6.892156816840496e-06, "loss": 0.0407, "step": 9414 }, { "epoch": 0.7330056931536179, "grad_norm": 0.43793013023395333, "learning_rate": 6.888386970715064e-06, "loss": 0.0342, "step": 9415 }, { "epoch": 0.7330835482458274, "grad_norm": 0.453629481525925, "learning_rate": 6.884617941381498e-06, "loss": 0.0353, "step": 9416 }, { "epoch": 0.733161403338037, "grad_norm": 0.4267149440594045, "learning_rate": 6.880849729074588e-06, "loss": 0.0403, "step": 9417 }, { "epoch": 0.7332392584302467, "grad_norm": 0.4870558592167661, "learning_rate": 6.877082334029064e-06, "loss": 0.0436, "step": 9418 }, { "epoch": 0.7333171135224563, "grad_norm": 0.43865346607437533, "learning_rate": 6.873315756479635e-06, "loss": 0.0454, "step": 9419 }, { "epoch": 0.7333949686146659, "grad_norm": 0.45165314900401865, "learning_rate": 6.869549996660931e-06, "loss": 0.042, "step": 9420 }, { "epoch": 0.7334728237068756, "grad_norm": 0.5257731467114334, "learning_rate": 6.865785054807546e-06, "loss": 0.072, "step": 9421 }, { "epoch": 0.7335506787990852, "grad_norm": 0.4270992027915333, "learning_rate": 6.862020931154015e-06, "loss": 0.0435, "step": 9422 }, { "epoch": 0.7336285338912948, "grad_norm": 0.43050747159090624, "learning_rate": 6.858257625934817e-06, "loss": 0.0353, "step": 9423 }, { "epoch": 0.7337063889835045, "grad_norm": 0.47443314727068325, "learning_rate": 6.854495139384405e-06, "loss": 0.0534, "step": 9424 }, { "epoch": 0.7337842440757141, "grad_norm": 0.510005770512555, "learning_rate": 6.850733471737159e-06, "loss": 0.0555, "step": 9425 }, { "epoch": 0.7338620991679237, "grad_norm": 0.421443178056496, "learning_rate": 6.846972623227415e-06, "loss": 0.0389, "step": 9426 }, { "epoch": 0.7339399542601334, "grad_norm": 0.36677221478124605, "learning_rate": 6.8432125940894455e-06, "loss": 0.0292, "step": 9427 }, { "epoch": 0.734017809352343, "grad_norm": 0.4968128527088345, "learning_rate": 6.839453384557484e-06, "loss": 0.0534, "step": 9428 }, { "epoch": 0.7340956644445525, "grad_norm": 0.466275512752984, "learning_rate": 6.835694994865723e-06, "loss": 0.0392, "step": 9429 }, { "epoch": 0.7341735195367622, "grad_norm": 0.6094873887787555, "learning_rate": 6.831937425248283e-06, "loss": 0.0647, "step": 9430 }, { "epoch": 0.7342513746289718, "grad_norm": 0.4061492725814327, "learning_rate": 6.828180675939249e-06, "loss": 0.0365, "step": 9431 }, { "epoch": 0.7343292297211814, "grad_norm": 0.40175735391642625, "learning_rate": 6.824424747172644e-06, "loss": 0.0315, "step": 9432 }, { "epoch": 0.7344070848133911, "grad_norm": 0.622983102147684, "learning_rate": 6.820669639182443e-06, "loss": 0.0614, "step": 9433 }, { "epoch": 0.7344849399056007, "grad_norm": 0.4472096494021232, "learning_rate": 6.81691535220258e-06, "loss": 0.0426, "step": 9434 }, { "epoch": 0.7345627949978103, "grad_norm": 0.5088286868113153, "learning_rate": 6.813161886466931e-06, "loss": 0.0573, "step": 9435 }, { "epoch": 0.73464065009002, "grad_norm": 0.45238399055887607, "learning_rate": 6.809409242209306e-06, "loss": 0.0467, "step": 9436 }, { "epoch": 0.7347185051822296, "grad_norm": 0.4525473115948504, "learning_rate": 6.805657419663483e-06, "loss": 0.0362, "step": 9437 }, { "epoch": 0.7347963602744392, "grad_norm": 0.3757024055381322, "learning_rate": 6.801906419063184e-06, "loss": 0.0294, "step": 9438 }, { "epoch": 0.7348742153666489, "grad_norm": 0.433886546857179, "learning_rate": 6.798156240642076e-06, "loss": 0.0411, "step": 9439 }, { "epoch": 0.7349520704588585, "grad_norm": 0.42620890699524144, "learning_rate": 6.794406884633782e-06, "loss": 0.0318, "step": 9440 }, { "epoch": 0.735029925551068, "grad_norm": 0.5693852631303843, "learning_rate": 6.790658351271871e-06, "loss": 0.0635, "step": 9441 }, { "epoch": 0.7351077806432778, "grad_norm": 0.5349241309525621, "learning_rate": 6.786910640789852e-06, "loss": 0.0697, "step": 9442 }, { "epoch": 0.7351856357354873, "grad_norm": 0.48082070326666976, "learning_rate": 6.783163753421194e-06, "loss": 0.0515, "step": 9443 }, { "epoch": 0.7352634908276969, "grad_norm": 0.6025764675649083, "learning_rate": 6.77941768939931e-06, "loss": 0.0664, "step": 9444 }, { "epoch": 0.7353413459199065, "grad_norm": 0.5683305654653283, "learning_rate": 6.775672448957562e-06, "loss": 0.0593, "step": 9445 }, { "epoch": 0.7354192010121162, "grad_norm": 0.3637667567237369, "learning_rate": 6.77192803232926e-06, "loss": 0.0277, "step": 9446 }, { "epoch": 0.7354970561043258, "grad_norm": 0.509192520452385, "learning_rate": 6.768184439747665e-06, "loss": 0.0519, "step": 9447 }, { "epoch": 0.7355749111965354, "grad_norm": 0.4637299636134213, "learning_rate": 6.764441671445982e-06, "loss": 0.0392, "step": 9448 }, { "epoch": 0.7356527662887451, "grad_norm": 0.40685106823385037, "learning_rate": 6.760699727657367e-06, "loss": 0.0342, "step": 9449 }, { "epoch": 0.7357306213809547, "grad_norm": 0.43793543231020304, "learning_rate": 6.7569586086149345e-06, "loss": 0.0458, "step": 9450 }, { "epoch": 0.7357306213809547, "eval_loss": 0.00678621418774128, "eval_runtime": 167.2074, "eval_samples_per_second": 17.224, "eval_steps_per_second": 0.616, "step": 9450 }, { "epoch": 0.7358084764731643, "grad_norm": 0.48546847417110683, "learning_rate": 6.753218314551733e-06, "loss": 0.0517, "step": 9451 }, { "epoch": 0.735886331565374, "grad_norm": 0.4911524016636711, "learning_rate": 6.749478845700763e-06, "loss": 0.0446, "step": 9452 }, { "epoch": 0.7359641866575836, "grad_norm": 0.47668630676854973, "learning_rate": 6.745740202294979e-06, "loss": 0.0432, "step": 9453 }, { "epoch": 0.7360420417497932, "grad_norm": 0.4110259139210899, "learning_rate": 6.742002384567281e-06, "loss": 0.034, "step": 9454 }, { "epoch": 0.7361198968420029, "grad_norm": 0.46498334306302475, "learning_rate": 6.738265392750516e-06, "loss": 0.0434, "step": 9455 }, { "epoch": 0.7361977519342124, "grad_norm": 0.5984345227950805, "learning_rate": 6.73452922707748e-06, "loss": 0.0509, "step": 9456 }, { "epoch": 0.736275607026422, "grad_norm": 0.568533613706197, "learning_rate": 6.730793887780922e-06, "loss": 0.0843, "step": 9457 }, { "epoch": 0.7363534621186317, "grad_norm": 0.4086307125665333, "learning_rate": 6.727059375093532e-06, "loss": 0.0392, "step": 9458 }, { "epoch": 0.7364313172108413, "grad_norm": 0.4986753979337271, "learning_rate": 6.7233256892479485e-06, "loss": 0.0513, "step": 9459 }, { "epoch": 0.7365091723030509, "grad_norm": 0.44050161423291223, "learning_rate": 6.719592830476776e-06, "loss": 0.0409, "step": 9460 }, { "epoch": 0.7365870273952606, "grad_norm": 0.6268591086385793, "learning_rate": 6.7158607990125455e-06, "loss": 0.0842, "step": 9461 }, { "epoch": 0.7366648824874702, "grad_norm": 0.4561217663257433, "learning_rate": 6.712129595087749e-06, "loss": 0.0531, "step": 9462 }, { "epoch": 0.7367427375796798, "grad_norm": 0.4527437560873192, "learning_rate": 6.708399218934818e-06, "loss": 0.0605, "step": 9463 }, { "epoch": 0.7368205926718895, "grad_norm": 0.4782112277277075, "learning_rate": 6.7046696707861396e-06, "loss": 0.0555, "step": 9464 }, { "epoch": 0.7368984477640991, "grad_norm": 0.4744940675824017, "learning_rate": 6.700940950874048e-06, "loss": 0.0463, "step": 9465 }, { "epoch": 0.7369763028563087, "grad_norm": 0.6196525289927232, "learning_rate": 6.697213059430823e-06, "loss": 0.0848, "step": 9466 }, { "epoch": 0.7370541579485184, "grad_norm": 0.45458753757841, "learning_rate": 6.693485996688695e-06, "loss": 0.0414, "step": 9467 }, { "epoch": 0.737132013040728, "grad_norm": 0.7103769969470246, "learning_rate": 6.689759762879846e-06, "loss": 0.1203, "step": 9468 }, { "epoch": 0.7372098681329375, "grad_norm": 0.4458205752995641, "learning_rate": 6.686034358236393e-06, "loss": 0.0373, "step": 9469 }, { "epoch": 0.7372877232251472, "grad_norm": 0.47208650807500246, "learning_rate": 6.682309782990423e-06, "loss": 0.045, "step": 9470 }, { "epoch": 0.7373655783173568, "grad_norm": 0.4650953868599734, "learning_rate": 6.678586037373956e-06, "loss": 0.0466, "step": 9471 }, { "epoch": 0.7374434334095664, "grad_norm": 0.40939480807767503, "learning_rate": 6.6748631216189615e-06, "loss": 0.0299, "step": 9472 }, { "epoch": 0.7375212885017761, "grad_norm": 0.5460657852565388, "learning_rate": 6.671141035957363e-06, "loss": 0.0514, "step": 9473 }, { "epoch": 0.7375991435939857, "grad_norm": 0.48140826509587314, "learning_rate": 6.667419780621027e-06, "loss": 0.048, "step": 9474 }, { "epoch": 0.7376769986861953, "grad_norm": 0.5042076111056155, "learning_rate": 6.663699355841768e-06, "loss": 0.0384, "step": 9475 }, { "epoch": 0.737754853778405, "grad_norm": 0.42305956657320937, "learning_rate": 6.659979761851354e-06, "loss": 0.0418, "step": 9476 }, { "epoch": 0.7378327088706146, "grad_norm": 0.5918372237027143, "learning_rate": 6.6562609988814965e-06, "loss": 0.0726, "step": 9477 }, { "epoch": 0.7379105639628242, "grad_norm": 0.42674765204030796, "learning_rate": 6.652543067163859e-06, "loss": 0.0378, "step": 9478 }, { "epoch": 0.7379884190550339, "grad_norm": 0.4986639088324783, "learning_rate": 6.648825966930048e-06, "loss": 0.0508, "step": 9479 }, { "epoch": 0.7380662741472435, "grad_norm": 0.5047315329136853, "learning_rate": 6.645109698411618e-06, "loss": 0.0719, "step": 9480 }, { "epoch": 0.738144129239453, "grad_norm": 0.5110789105996643, "learning_rate": 6.641394261840086e-06, "loss": 0.055, "step": 9481 }, { "epoch": 0.7382219843316626, "grad_norm": 0.444363919948425, "learning_rate": 6.637679657446901e-06, "loss": 0.0458, "step": 9482 }, { "epoch": 0.7382998394238723, "grad_norm": 0.48136982499519726, "learning_rate": 6.633965885463464e-06, "loss": 0.0498, "step": 9483 }, { "epoch": 0.7383776945160819, "grad_norm": 0.4743371131488717, "learning_rate": 6.630252946121134e-06, "loss": 0.0499, "step": 9484 }, { "epoch": 0.7384555496082915, "grad_norm": 0.5788236159275466, "learning_rate": 6.62654083965119e-06, "loss": 0.0658, "step": 9485 }, { "epoch": 0.7385334047005012, "grad_norm": 0.4573797460381964, "learning_rate": 6.622829566284896e-06, "loss": 0.0408, "step": 9486 }, { "epoch": 0.7386112597927108, "grad_norm": 0.4327526951473891, "learning_rate": 6.6191191262534416e-06, "loss": 0.0409, "step": 9487 }, { "epoch": 0.7386891148849204, "grad_norm": 0.4966355583291538, "learning_rate": 6.61540951978797e-06, "loss": 0.0483, "step": 9488 }, { "epoch": 0.7387669699771301, "grad_norm": 0.4634971315045016, "learning_rate": 6.611700747119571e-06, "loss": 0.0472, "step": 9489 }, { "epoch": 0.7388448250693397, "grad_norm": 0.4438144704932606, "learning_rate": 6.607992808479282e-06, "loss": 0.038, "step": 9490 }, { "epoch": 0.7389226801615493, "grad_norm": 0.44498671914541044, "learning_rate": 6.604285704098097e-06, "loss": 0.0406, "step": 9491 }, { "epoch": 0.739000535253759, "grad_norm": 0.5249769425606999, "learning_rate": 6.600579434206946e-06, "loss": 0.0421, "step": 9492 }, { "epoch": 0.7390783903459686, "grad_norm": 0.5572239564425584, "learning_rate": 6.596873999036724e-06, "loss": 0.0614, "step": 9493 }, { "epoch": 0.7391562454381781, "grad_norm": 0.5884702408283795, "learning_rate": 6.593169398818242e-06, "loss": 0.0672, "step": 9494 }, { "epoch": 0.7392341005303878, "grad_norm": 0.5316608926553897, "learning_rate": 6.589465633782282e-06, "loss": 0.0479, "step": 9495 }, { "epoch": 0.7393119556225974, "grad_norm": 0.5769252632371392, "learning_rate": 6.585762704159586e-06, "loss": 0.0679, "step": 9496 }, { "epoch": 0.739389810714807, "grad_norm": 0.48122823224873185, "learning_rate": 6.5820606101808205e-06, "loss": 0.0437, "step": 9497 }, { "epoch": 0.7394676658070167, "grad_norm": 0.4982051622032815, "learning_rate": 6.5783593520766085e-06, "loss": 0.0469, "step": 9498 }, { "epoch": 0.7395455208992263, "grad_norm": 0.46082928842326437, "learning_rate": 6.574658930077522e-06, "loss": 0.0472, "step": 9499 }, { "epoch": 0.7396233759914359, "grad_norm": 0.5516650283472564, "learning_rate": 6.570959344414074e-06, "loss": 0.0668, "step": 9500 }, { "epoch": 0.7396233759914359, "eval_loss": 0.006765433587133884, "eval_runtime": 166.9951, "eval_samples_per_second": 17.246, "eval_steps_per_second": 0.617, "step": 9500 }, { "epoch": 0.7397012310836456, "grad_norm": 0.6045860622717235, "learning_rate": 6.567260595316742e-06, "loss": 0.0791, "step": 9501 }, { "epoch": 0.7397790861758552, "grad_norm": 0.4160206308217441, "learning_rate": 6.563562683015942e-06, "loss": 0.0341, "step": 9502 }, { "epoch": 0.7398569412680648, "grad_norm": 0.5159677520347624, "learning_rate": 6.5598656077420245e-06, "loss": 0.0563, "step": 9503 }, { "epoch": 0.7399347963602745, "grad_norm": 0.622511510366809, "learning_rate": 6.556169369725305e-06, "loss": 0.0733, "step": 9504 }, { "epoch": 0.7400126514524841, "grad_norm": 0.47377674404887327, "learning_rate": 6.552473969196033e-06, "loss": 0.0441, "step": 9505 }, { "epoch": 0.7400905065446937, "grad_norm": 0.48658953252632814, "learning_rate": 6.5487794063844335e-06, "loss": 0.0509, "step": 9506 }, { "epoch": 0.7401683616369034, "grad_norm": 0.43708351696202075, "learning_rate": 6.545085681520651e-06, "loss": 0.0389, "step": 9507 }, { "epoch": 0.740246216729113, "grad_norm": 0.527988723001295, "learning_rate": 6.5413927948347845e-06, "loss": 0.0584, "step": 9508 }, { "epoch": 0.7403240718213225, "grad_norm": 0.4603265612890622, "learning_rate": 6.537700746556888e-06, "loss": 0.053, "step": 9509 }, { "epoch": 0.7404019269135322, "grad_norm": 0.45033007821409987, "learning_rate": 6.534009536916947e-06, "loss": 0.0494, "step": 9510 }, { "epoch": 0.7404797820057418, "grad_norm": 0.5131958724567467, "learning_rate": 6.5303191661449335e-06, "loss": 0.0628, "step": 9511 }, { "epoch": 0.7405576370979514, "grad_norm": 0.42728137560823287, "learning_rate": 6.526629634470714e-06, "loss": 0.0521, "step": 9512 }, { "epoch": 0.7406354921901611, "grad_norm": 0.4657124755087115, "learning_rate": 6.522940942124136e-06, "loss": 0.0431, "step": 9513 }, { "epoch": 0.7407133472823707, "grad_norm": 0.4242423581242162, "learning_rate": 6.519253089334994e-06, "loss": 0.0339, "step": 9514 }, { "epoch": 0.7407912023745803, "grad_norm": 0.43044913062494017, "learning_rate": 6.515566076333015e-06, "loss": 0.0369, "step": 9515 }, { "epoch": 0.7408690574667899, "grad_norm": 0.3610111891113816, "learning_rate": 6.511879903347884e-06, "loss": 0.0245, "step": 9516 }, { "epoch": 0.7409469125589996, "grad_norm": 0.46174258772717985, "learning_rate": 6.508194570609239e-06, "loss": 0.0438, "step": 9517 }, { "epoch": 0.7410247676512092, "grad_norm": 0.4902487266243001, "learning_rate": 6.504510078346657e-06, "loss": 0.046, "step": 9518 }, { "epoch": 0.7411026227434188, "grad_norm": 0.4841975248338405, "learning_rate": 6.500826426789661e-06, "loss": 0.0563, "step": 9519 }, { "epoch": 0.7411804778356285, "grad_norm": 0.4331801140186708, "learning_rate": 6.497143616167729e-06, "loss": 0.0447, "step": 9520 }, { "epoch": 0.741258332927838, "grad_norm": 0.5906648085101708, "learning_rate": 6.493461646710279e-06, "loss": 0.0774, "step": 9521 }, { "epoch": 0.7413361880200476, "grad_norm": 0.47810401456869045, "learning_rate": 6.4897805186466825e-06, "loss": 0.0436, "step": 9522 }, { "epoch": 0.7414140431122573, "grad_norm": 0.5204869432245127, "learning_rate": 6.486100232206256e-06, "loss": 0.0584, "step": 9523 }, { "epoch": 0.7414918982044669, "grad_norm": 0.5481786968542159, "learning_rate": 6.482420787618265e-06, "loss": 0.056, "step": 9524 }, { "epoch": 0.7415697532966765, "grad_norm": 0.5369142463675098, "learning_rate": 6.478742185111919e-06, "loss": 0.0564, "step": 9525 }, { "epoch": 0.7416476083888862, "grad_norm": 0.5586149017986456, "learning_rate": 6.475064424916375e-06, "loss": 0.0716, "step": 9526 }, { "epoch": 0.7417254634810958, "grad_norm": 0.513436141800718, "learning_rate": 6.471387507260749e-06, "loss": 0.0585, "step": 9527 }, { "epoch": 0.7418033185733054, "grad_norm": 0.46695398982506253, "learning_rate": 6.467711432374093e-06, "loss": 0.0419, "step": 9528 }, { "epoch": 0.7418811736655151, "grad_norm": 0.44753198141417627, "learning_rate": 6.464036200485406e-06, "loss": 0.0462, "step": 9529 }, { "epoch": 0.7419590287577247, "grad_norm": 0.38125009330849247, "learning_rate": 6.460361811823639e-06, "loss": 0.0319, "step": 9530 }, { "epoch": 0.7420368838499343, "grad_norm": 0.5013992433345452, "learning_rate": 6.45668826661769e-06, "loss": 0.0503, "step": 9531 }, { "epoch": 0.742114738942144, "grad_norm": 0.3965035905678252, "learning_rate": 6.453015565096403e-06, "loss": 0.024, "step": 9532 }, { "epoch": 0.7421925940343536, "grad_norm": 0.4994141773737944, "learning_rate": 6.449343707488569e-06, "loss": 0.0421, "step": 9533 }, { "epoch": 0.7422704491265631, "grad_norm": 0.5582709133778575, "learning_rate": 6.4456726940229286e-06, "loss": 0.0667, "step": 9534 }, { "epoch": 0.7423483042187728, "grad_norm": 0.43693162826573734, "learning_rate": 6.442002524928168e-06, "loss": 0.0511, "step": 9535 }, { "epoch": 0.7424261593109824, "grad_norm": 0.5196215245127933, "learning_rate": 6.438333200432916e-06, "loss": 0.0587, "step": 9536 }, { "epoch": 0.742504014403192, "grad_norm": 0.41400928683404814, "learning_rate": 6.434664720765766e-06, "loss": 0.0223, "step": 9537 }, { "epoch": 0.7425818694954017, "grad_norm": 0.4721086264758295, "learning_rate": 6.430997086155241e-06, "loss": 0.0492, "step": 9538 }, { "epoch": 0.7426597245876113, "grad_norm": 0.44666507147983786, "learning_rate": 6.427330296829821e-06, "loss": 0.0233, "step": 9539 }, { "epoch": 0.7427375796798209, "grad_norm": 0.473279058408922, "learning_rate": 6.423664353017922e-06, "loss": 0.0427, "step": 9540 }, { "epoch": 0.7428154347720306, "grad_norm": 0.5061033750893322, "learning_rate": 6.419999254947922e-06, "loss": 0.0507, "step": 9541 }, { "epoch": 0.7428932898642402, "grad_norm": 0.41555342305786463, "learning_rate": 6.416335002848135e-06, "loss": 0.0369, "step": 9542 }, { "epoch": 0.7429711449564498, "grad_norm": 0.48301260173656657, "learning_rate": 6.412671596946829e-06, "loss": 0.0498, "step": 9543 }, { "epoch": 0.7430490000486595, "grad_norm": 0.45194977406441644, "learning_rate": 6.409009037472218e-06, "loss": 0.0326, "step": 9544 }, { "epoch": 0.7431268551408691, "grad_norm": 0.44902824052377127, "learning_rate": 6.405347324652458e-06, "loss": 0.0394, "step": 9545 }, { "epoch": 0.7432047102330787, "grad_norm": 0.4229022915531064, "learning_rate": 6.401686458715655e-06, "loss": 0.0378, "step": 9546 }, { "epoch": 0.7432825653252884, "grad_norm": 0.5560925655603554, "learning_rate": 6.398026439889873e-06, "loss": 0.056, "step": 9547 }, { "epoch": 0.743360420417498, "grad_norm": 0.42944979561609775, "learning_rate": 6.39436726840311e-06, "loss": 0.0328, "step": 9548 }, { "epoch": 0.7434382755097075, "grad_norm": 0.4532179147872709, "learning_rate": 6.390708944483313e-06, "loss": 0.0347, "step": 9549 }, { "epoch": 0.7435161306019172, "grad_norm": 0.4834657611873114, "learning_rate": 6.38705146835838e-06, "loss": 0.0357, "step": 9550 }, { "epoch": 0.7435161306019172, "eval_loss": 0.0067780716344714165, "eval_runtime": 166.7472, "eval_samples_per_second": 17.272, "eval_steps_per_second": 0.618, "step": 9550 }, { "epoch": 0.7435939856941268, "grad_norm": 0.524525816752159, "learning_rate": 6.383394840256154e-06, "loss": 0.0529, "step": 9551 }, { "epoch": 0.7436718407863364, "grad_norm": 0.44674350779405253, "learning_rate": 6.379739060404424e-06, "loss": 0.0375, "step": 9552 }, { "epoch": 0.743749695878546, "grad_norm": 0.5475837063709668, "learning_rate": 6.376084129030933e-06, "loss": 0.0426, "step": 9553 }, { "epoch": 0.7438275509707557, "grad_norm": 0.4783770568355624, "learning_rate": 6.37243004636336e-06, "loss": 0.0298, "step": 9554 }, { "epoch": 0.7439054060629653, "grad_norm": 0.46360075148702024, "learning_rate": 6.3687768126293405e-06, "loss": 0.0542, "step": 9555 }, { "epoch": 0.7439832611551749, "grad_norm": 0.5333568098167226, "learning_rate": 6.3651244280564526e-06, "loss": 0.0637, "step": 9556 }, { "epoch": 0.7440611162473846, "grad_norm": 0.4245063383104033, "learning_rate": 6.361472892872218e-06, "loss": 0.0365, "step": 9557 }, { "epoch": 0.7441389713395942, "grad_norm": 0.3881818793097938, "learning_rate": 6.357822207304123e-06, "loss": 0.0303, "step": 9558 }, { "epoch": 0.7442168264318038, "grad_norm": 0.4442856217371147, "learning_rate": 6.354172371579579e-06, "loss": 0.0428, "step": 9559 }, { "epoch": 0.7442946815240135, "grad_norm": 0.4308141642789819, "learning_rate": 6.350523385925962e-06, "loss": 0.0384, "step": 9560 }, { "epoch": 0.744372536616223, "grad_norm": 0.4813286409216783, "learning_rate": 6.346875250570572e-06, "loss": 0.054, "step": 9561 }, { "epoch": 0.7444503917084326, "grad_norm": 0.4000265650969341, "learning_rate": 6.343227965740673e-06, "loss": 0.0312, "step": 9562 }, { "epoch": 0.7445282468006423, "grad_norm": 0.39744502528503456, "learning_rate": 6.339581531663485e-06, "loss": 0.0239, "step": 9563 }, { "epoch": 0.7446061018928519, "grad_norm": 0.4872240408064262, "learning_rate": 6.335935948566161e-06, "loss": 0.0417, "step": 9564 }, { "epoch": 0.7446839569850615, "grad_norm": 0.5573975398712523, "learning_rate": 6.332291216675797e-06, "loss": 0.0549, "step": 9565 }, { "epoch": 0.7447618120772712, "grad_norm": 0.5371124518499534, "learning_rate": 6.328647336219449e-06, "loss": 0.0543, "step": 9566 }, { "epoch": 0.7448396671694808, "grad_norm": 0.44659276955260313, "learning_rate": 6.325004307424103e-06, "loss": 0.0374, "step": 9567 }, { "epoch": 0.7449175222616904, "grad_norm": 0.6146627799553992, "learning_rate": 6.321362130516717e-06, "loss": 0.0697, "step": 9568 }, { "epoch": 0.7449953773539001, "grad_norm": 0.6014314718162214, "learning_rate": 6.3177208057241815e-06, "loss": 0.0623, "step": 9569 }, { "epoch": 0.7450732324461097, "grad_norm": 0.5213573035646655, "learning_rate": 6.3140803332733205e-06, "loss": 0.06, "step": 9570 }, { "epoch": 0.7451510875383193, "grad_norm": 0.4333700118918107, "learning_rate": 6.310440713390926e-06, "loss": 0.0455, "step": 9571 }, { "epoch": 0.745228942630529, "grad_norm": 0.42486674212578807, "learning_rate": 6.306801946303722e-06, "loss": 0.0444, "step": 9572 }, { "epoch": 0.7453067977227386, "grad_norm": 0.4723175192469607, "learning_rate": 6.3031640322384e-06, "loss": 0.043, "step": 9573 }, { "epoch": 0.7453846528149481, "grad_norm": 0.6253936676279518, "learning_rate": 6.299526971421577e-06, "loss": 0.0792, "step": 9574 }, { "epoch": 0.7454625079071578, "grad_norm": 0.4277229175359644, "learning_rate": 6.295890764079826e-06, "loss": 0.0364, "step": 9575 }, { "epoch": 0.7455403629993674, "grad_norm": 0.4386522287060852, "learning_rate": 6.292255410439665e-06, "loss": 0.0435, "step": 9576 }, { "epoch": 0.745618218091577, "grad_norm": 0.4506893596969916, "learning_rate": 6.288620910727554e-06, "loss": 0.0483, "step": 9577 }, { "epoch": 0.7456960731837867, "grad_norm": 0.4854418314736612, "learning_rate": 6.284987265169924e-06, "loss": 0.0507, "step": 9578 }, { "epoch": 0.7457739282759963, "grad_norm": 0.530777166741782, "learning_rate": 6.281354473993116e-06, "loss": 0.0642, "step": 9579 }, { "epoch": 0.7458517833682059, "grad_norm": 0.6511765103542623, "learning_rate": 6.2777225374234385e-06, "loss": 0.0853, "step": 9580 }, { "epoch": 0.7459296384604156, "grad_norm": 0.41804318331874346, "learning_rate": 6.274091455687148e-06, "loss": 0.04, "step": 9581 }, { "epoch": 0.7460074935526252, "grad_norm": 0.490246741354547, "learning_rate": 6.270461229010434e-06, "loss": 0.0433, "step": 9582 }, { "epoch": 0.7460853486448348, "grad_norm": 0.42108416362351364, "learning_rate": 6.2668318576194595e-06, "loss": 0.0334, "step": 9583 }, { "epoch": 0.7461632037370445, "grad_norm": 0.5822357082929754, "learning_rate": 6.263203341740307e-06, "loss": 0.0828, "step": 9584 }, { "epoch": 0.7462410588292541, "grad_norm": 0.4777807008024036, "learning_rate": 6.2595756815990195e-06, "loss": 0.0491, "step": 9585 }, { "epoch": 0.7463189139214637, "grad_norm": 0.49485856437985315, "learning_rate": 6.25594887742158e-06, "loss": 0.0695, "step": 9586 }, { "epoch": 0.7463967690136734, "grad_norm": 0.41624715347919855, "learning_rate": 6.252322929433923e-06, "loss": 0.0376, "step": 9587 }, { "epoch": 0.746474624105883, "grad_norm": 0.4729721216292265, "learning_rate": 6.248697837861928e-06, "loss": 0.0538, "step": 9588 }, { "epoch": 0.7465524791980925, "grad_norm": 0.5427693387399563, "learning_rate": 6.24507360293142e-06, "loss": 0.0732, "step": 9589 }, { "epoch": 0.7466303342903021, "grad_norm": 0.48231116738460783, "learning_rate": 6.241450224868173e-06, "loss": 0.053, "step": 9590 }, { "epoch": 0.7467081893825118, "grad_norm": 0.516843168804184, "learning_rate": 6.2378277038979075e-06, "loss": 0.0498, "step": 9591 }, { "epoch": 0.7467860444747214, "grad_norm": 0.4652225459645319, "learning_rate": 6.234206040246287e-06, "loss": 0.0531, "step": 9592 }, { "epoch": 0.746863899566931, "grad_norm": 0.4749738151239165, "learning_rate": 6.23058523413892e-06, "loss": 0.056, "step": 9593 }, { "epoch": 0.7469417546591407, "grad_norm": 0.40360746220624094, "learning_rate": 6.226965285801374e-06, "loss": 0.0348, "step": 9594 }, { "epoch": 0.7470196097513503, "grad_norm": 0.45104400818602863, "learning_rate": 6.223346195459155e-06, "loss": 0.0489, "step": 9595 }, { "epoch": 0.7470974648435599, "grad_norm": 0.3812044346094998, "learning_rate": 6.219727963337712e-06, "loss": 0.0405, "step": 9596 }, { "epoch": 0.7471753199357696, "grad_norm": 0.4870701350148991, "learning_rate": 6.216110589662441e-06, "loss": 0.0449, "step": 9597 }, { "epoch": 0.7472531750279792, "grad_norm": 0.4319824812418237, "learning_rate": 6.212494074658692e-06, "loss": 0.0512, "step": 9598 }, { "epoch": 0.7473310301201888, "grad_norm": 0.5244458072941602, "learning_rate": 6.208878418551756e-06, "loss": 0.0579, "step": 9599 }, { "epoch": 0.7474088852123985, "grad_norm": 0.47461789727145476, "learning_rate": 6.205263621566871e-06, "loss": 0.055, "step": 9600 }, { "epoch": 0.7474088852123985, "eval_loss": 0.006570535711944103, "eval_runtime": 167.7226, "eval_samples_per_second": 17.171, "eval_steps_per_second": 0.614, "step": 9600 }, { "epoch": 0.747486740304608, "grad_norm": 0.5345280562773518, "learning_rate": 6.201649683929219e-06, "loss": 0.0665, "step": 9601 }, { "epoch": 0.7475645953968176, "grad_norm": 0.40639080927837273, "learning_rate": 6.198036605863935e-06, "loss": 0.0396, "step": 9602 }, { "epoch": 0.7476424504890273, "grad_norm": 0.5053055479257726, "learning_rate": 6.194424387596089e-06, "loss": 0.0602, "step": 9603 }, { "epoch": 0.7477203055812369, "grad_norm": 0.5488322113679427, "learning_rate": 6.190813029350717e-06, "loss": 0.0414, "step": 9604 }, { "epoch": 0.7477981606734465, "grad_norm": 0.5644024857471485, "learning_rate": 6.187202531352787e-06, "loss": 0.059, "step": 9605 }, { "epoch": 0.7478760157656562, "grad_norm": 0.4072238192951751, "learning_rate": 6.183592893827211e-06, "loss": 0.0311, "step": 9606 }, { "epoch": 0.7479538708578658, "grad_norm": 0.531378699424525, "learning_rate": 6.1799841169988565e-06, "loss": 0.0543, "step": 9607 }, { "epoch": 0.7480317259500754, "grad_norm": 0.4931687389721327, "learning_rate": 6.176376201092531e-06, "loss": 0.0536, "step": 9608 }, { "epoch": 0.7481095810422851, "grad_norm": 0.4855214311256932, "learning_rate": 6.172769146332991e-06, "loss": 0.0473, "step": 9609 }, { "epoch": 0.7481874361344947, "grad_norm": 0.4860508155281277, "learning_rate": 6.16916295294494e-06, "loss": 0.05, "step": 9610 }, { "epoch": 0.7482652912267043, "grad_norm": 0.3954904046279034, "learning_rate": 6.165557621153027e-06, "loss": 0.0376, "step": 9611 }, { "epoch": 0.748343146318914, "grad_norm": 0.49313622102034116, "learning_rate": 6.161953151181846e-06, "loss": 0.048, "step": 9612 }, { "epoch": 0.7484210014111236, "grad_norm": 0.5140274359374548, "learning_rate": 6.15834954325593e-06, "loss": 0.0619, "step": 9613 }, { "epoch": 0.7484988565033331, "grad_norm": 0.4536141156205786, "learning_rate": 6.1547467975997865e-06, "loss": 0.0522, "step": 9614 }, { "epoch": 0.7485767115955428, "grad_norm": 0.47019596588418455, "learning_rate": 6.151144914437837e-06, "loss": 0.0415, "step": 9615 }, { "epoch": 0.7486545666877524, "grad_norm": 0.49323273489776404, "learning_rate": 6.147543893994463e-06, "loss": 0.05, "step": 9616 }, { "epoch": 0.748732421779962, "grad_norm": 0.46153483257128036, "learning_rate": 6.143943736494001e-06, "loss": 0.0399, "step": 9617 }, { "epoch": 0.7488102768721717, "grad_norm": 0.3908246617681855, "learning_rate": 6.140344442160702e-06, "loss": 0.0298, "step": 9618 }, { "epoch": 0.7488881319643813, "grad_norm": 0.5048533144750242, "learning_rate": 6.136746011218804e-06, "loss": 0.0588, "step": 9619 }, { "epoch": 0.7489659870565909, "grad_norm": 0.559059267861571, "learning_rate": 6.133148443892465e-06, "loss": 0.068, "step": 9620 }, { "epoch": 0.7490438421488006, "grad_norm": 0.44129623492168535, "learning_rate": 6.1295517404057995e-06, "loss": 0.0412, "step": 9621 }, { "epoch": 0.7491216972410102, "grad_norm": 0.4858589562346196, "learning_rate": 6.125955900982863e-06, "loss": 0.0583, "step": 9622 }, { "epoch": 0.7491995523332198, "grad_norm": 0.5489461818441621, "learning_rate": 6.122360925847653e-06, "loss": 0.0425, "step": 9623 }, { "epoch": 0.7492774074254294, "grad_norm": 0.5269433500241931, "learning_rate": 6.118766815224135e-06, "loss": 0.0522, "step": 9624 }, { "epoch": 0.7493552625176391, "grad_norm": 0.5336562373732934, "learning_rate": 6.115173569336195e-06, "loss": 0.0509, "step": 9625 }, { "epoch": 0.7494331176098487, "grad_norm": 0.4479436096504012, "learning_rate": 6.111581188407685e-06, "loss": 0.0419, "step": 9626 }, { "epoch": 0.7495109727020582, "grad_norm": 0.39197596438611565, "learning_rate": 6.107989672662378e-06, "loss": 0.0338, "step": 9627 }, { "epoch": 0.7495888277942679, "grad_norm": 0.3800025237646412, "learning_rate": 6.10439902232401e-06, "loss": 0.0263, "step": 9628 }, { "epoch": 0.7496666828864775, "grad_norm": 0.5475337004131829, "learning_rate": 6.100809237616276e-06, "loss": 0.0587, "step": 9629 }, { "epoch": 0.7497445379786871, "grad_norm": 0.5022335613651151, "learning_rate": 6.097220318762793e-06, "loss": 0.0652, "step": 9630 }, { "epoch": 0.7498223930708968, "grad_norm": 0.34729011036476765, "learning_rate": 6.093632265987135e-06, "loss": 0.0272, "step": 9631 }, { "epoch": 0.7499002481631064, "grad_norm": 0.4621996335142977, "learning_rate": 6.090045079512821e-06, "loss": 0.0463, "step": 9632 }, { "epoch": 0.749978103255316, "grad_norm": 0.4499020690256305, "learning_rate": 6.086458759563316e-06, "loss": 0.0407, "step": 9633 }, { "epoch": 0.7500559583475257, "grad_norm": 0.4713336402123186, "learning_rate": 6.082873306362027e-06, "loss": 0.0474, "step": 9634 }, { "epoch": 0.7501338134397353, "grad_norm": 0.5331866689783168, "learning_rate": 6.079288720132319e-06, "loss": 0.0589, "step": 9635 }, { "epoch": 0.7502116685319449, "grad_norm": 0.540267995390373, "learning_rate": 6.075705001097496e-06, "loss": 0.0662, "step": 9636 }, { "epoch": 0.7502895236241546, "grad_norm": 0.6506564386188368, "learning_rate": 6.072122149480797e-06, "loss": 0.0718, "step": 9637 }, { "epoch": 0.7503673787163642, "grad_norm": 0.5326503693064459, "learning_rate": 6.068540165505421e-06, "loss": 0.0549, "step": 9638 }, { "epoch": 0.7504452338085738, "grad_norm": 0.4710573948470044, "learning_rate": 6.064959049394501e-06, "loss": 0.0485, "step": 9639 }, { "epoch": 0.7505230889007835, "grad_norm": 0.3808049444573647, "learning_rate": 6.061378801371141e-06, "loss": 0.0304, "step": 9640 }, { "epoch": 0.750600943992993, "grad_norm": 0.5444336088364672, "learning_rate": 6.057799421658362e-06, "loss": 0.0693, "step": 9641 }, { "epoch": 0.7506787990852026, "grad_norm": 0.5317849791719083, "learning_rate": 6.054220910479145e-06, "loss": 0.0674, "step": 9642 }, { "epoch": 0.7507566541774123, "grad_norm": 0.3845904146602698, "learning_rate": 6.050643268056416e-06, "loss": 0.0325, "step": 9643 }, { "epoch": 0.7508345092696219, "grad_norm": 0.47345619060233096, "learning_rate": 6.047066494613037e-06, "loss": 0.0549, "step": 9644 }, { "epoch": 0.7509123643618315, "grad_norm": 0.4336566341142634, "learning_rate": 6.043490590371841e-06, "loss": 0.0451, "step": 9645 }, { "epoch": 0.7509902194540412, "grad_norm": 0.4291016826974759, "learning_rate": 6.039915555555576e-06, "loss": 0.0455, "step": 9646 }, { "epoch": 0.7510680745462508, "grad_norm": 0.6173936614727862, "learning_rate": 6.036341390386953e-06, "loss": 0.0841, "step": 9647 }, { "epoch": 0.7511459296384604, "grad_norm": 0.4770149177254477, "learning_rate": 6.032768095088626e-06, "loss": 0.0502, "step": 9648 }, { "epoch": 0.7512237847306701, "grad_norm": 0.5673498062060802, "learning_rate": 6.029195669883188e-06, "loss": 0.0788, "step": 9649 }, { "epoch": 0.7513016398228797, "grad_norm": 0.543850865295006, "learning_rate": 6.025624114993198e-06, "loss": 0.0659, "step": 9650 }, { "epoch": 0.7513016398228797, "eval_loss": 0.006547936238348484, "eval_runtime": 166.947, "eval_samples_per_second": 17.251, "eval_steps_per_second": 0.617, "step": 9650 }, { "epoch": 0.7513794949150893, "grad_norm": 0.43825121047233034, "learning_rate": 6.0220534306411414e-06, "loss": 0.0432, "step": 9651 }, { "epoch": 0.751457350007299, "grad_norm": 0.4421546572688713, "learning_rate": 6.018483617049453e-06, "loss": 0.0341, "step": 9652 }, { "epoch": 0.7515352050995086, "grad_norm": 0.44397546922128445, "learning_rate": 6.0149146744405176e-06, "loss": 0.0421, "step": 9653 }, { "epoch": 0.7516130601917181, "grad_norm": 0.45655703028272726, "learning_rate": 6.0113466030366606e-06, "loss": 0.043, "step": 9654 }, { "epoch": 0.7516909152839278, "grad_norm": 0.533519137143137, "learning_rate": 6.0077794030601565e-06, "loss": 0.0598, "step": 9655 }, { "epoch": 0.7517687703761374, "grad_norm": 0.42578654994552906, "learning_rate": 6.00421307473323e-06, "loss": 0.0352, "step": 9656 }, { "epoch": 0.751846625468347, "grad_norm": 0.48902360889953755, "learning_rate": 6.000647618278039e-06, "loss": 0.0508, "step": 9657 }, { "epoch": 0.7519244805605567, "grad_norm": 0.44354630221651054, "learning_rate": 5.997083033916702e-06, "loss": 0.0431, "step": 9658 }, { "epoch": 0.7520023356527663, "grad_norm": 0.5079832696278034, "learning_rate": 5.9935193218712636e-06, "loss": 0.0564, "step": 9659 }, { "epoch": 0.7520801907449759, "grad_norm": 0.604806032781654, "learning_rate": 5.989956482363744e-06, "loss": 0.0716, "step": 9660 }, { "epoch": 0.7521580458371855, "grad_norm": 0.5160099984744188, "learning_rate": 5.986394515616081e-06, "loss": 0.0518, "step": 9661 }, { "epoch": 0.7522359009293952, "grad_norm": 0.4167484202108892, "learning_rate": 5.982833421850169e-06, "loss": 0.0366, "step": 9662 }, { "epoch": 0.7523137560216048, "grad_norm": 0.47367240404003524, "learning_rate": 5.979273201287852e-06, "loss": 0.0544, "step": 9663 }, { "epoch": 0.7523916111138144, "grad_norm": 0.48809846197703166, "learning_rate": 5.975713854150909e-06, "loss": 0.0577, "step": 9664 }, { "epoch": 0.7524694662060241, "grad_norm": 0.35955052511874896, "learning_rate": 5.972155380661071e-06, "loss": 0.0316, "step": 9665 }, { "epoch": 0.7525473212982337, "grad_norm": 0.39555608662452946, "learning_rate": 5.9685977810400175e-06, "loss": 0.0273, "step": 9666 }, { "epoch": 0.7526251763904432, "grad_norm": 0.4236864395419718, "learning_rate": 5.965041055509371e-06, "loss": 0.0313, "step": 9667 }, { "epoch": 0.7527030314826529, "grad_norm": 0.4454036194584173, "learning_rate": 5.961485204290694e-06, "loss": 0.0472, "step": 9668 }, { "epoch": 0.7527808865748625, "grad_norm": 0.4462338651779455, "learning_rate": 5.957930227605502e-06, "loss": 0.0395, "step": 9669 }, { "epoch": 0.7528587416670721, "grad_norm": 0.35002536498373676, "learning_rate": 5.954376125675248e-06, "loss": 0.0264, "step": 9670 }, { "epoch": 0.7529365967592818, "grad_norm": 0.5370365164655244, "learning_rate": 5.9508228987213465e-06, "loss": 0.0544, "step": 9671 }, { "epoch": 0.7530144518514914, "grad_norm": 0.4490292110048132, "learning_rate": 5.947270546965143e-06, "loss": 0.036, "step": 9672 }, { "epoch": 0.753092306943701, "grad_norm": 0.5429362945102008, "learning_rate": 5.943719070627929e-06, "loss": 0.0672, "step": 9673 }, { "epoch": 0.7531701620359107, "grad_norm": 0.4757814575525971, "learning_rate": 5.940168469930947e-06, "loss": 0.0469, "step": 9674 }, { "epoch": 0.7532480171281203, "grad_norm": 0.4364855367486896, "learning_rate": 5.936618745095384e-06, "loss": 0.0317, "step": 9675 }, { "epoch": 0.7533258722203299, "grad_norm": 0.49766771042910574, "learning_rate": 5.933069896342367e-06, "loss": 0.0443, "step": 9676 }, { "epoch": 0.7534037273125396, "grad_norm": 0.4974605583404809, "learning_rate": 5.929521923892976e-06, "loss": 0.0443, "step": 9677 }, { "epoch": 0.7534815824047492, "grad_norm": 0.4185161695957384, "learning_rate": 5.925974827968233e-06, "loss": 0.0341, "step": 9678 }, { "epoch": 0.7535594374969588, "grad_norm": 0.4755777003358197, "learning_rate": 5.922428608789103e-06, "loss": 0.0511, "step": 9679 }, { "epoch": 0.7536372925891685, "grad_norm": 0.42007023079707434, "learning_rate": 5.918883266576494e-06, "loss": 0.03, "step": 9680 }, { "epoch": 0.753715147681378, "grad_norm": 0.41505094388056923, "learning_rate": 5.915338801551278e-06, "loss": 0.0308, "step": 9681 }, { "epoch": 0.7537930027735876, "grad_norm": 0.4927278017024213, "learning_rate": 5.9117952139342504e-06, "loss": 0.0461, "step": 9682 }, { "epoch": 0.7538708578657973, "grad_norm": 0.6976276349920459, "learning_rate": 5.9082525039461595e-06, "loss": 0.0853, "step": 9683 }, { "epoch": 0.7539487129580069, "grad_norm": 0.5685347828787629, "learning_rate": 5.904710671807708e-06, "loss": 0.0751, "step": 9684 }, { "epoch": 0.7540265680502165, "grad_norm": 0.5944207664220348, "learning_rate": 5.9011697177395145e-06, "loss": 0.0709, "step": 9685 }, { "epoch": 0.7541044231424262, "grad_norm": 0.536322502914278, "learning_rate": 5.897629641962184e-06, "loss": 0.0606, "step": 9686 }, { "epoch": 0.7541822782346358, "grad_norm": 0.5273368844906683, "learning_rate": 5.894090444696241e-06, "loss": 0.0546, "step": 9687 }, { "epoch": 0.7542601333268454, "grad_norm": 0.6314153952660593, "learning_rate": 5.890552126162157e-06, "loss": 0.0868, "step": 9688 }, { "epoch": 0.7543379884190551, "grad_norm": 0.4833154603269092, "learning_rate": 5.887014686580357e-06, "loss": 0.0496, "step": 9689 }, { "epoch": 0.7544158435112647, "grad_norm": 0.5115222034601576, "learning_rate": 5.883478126171198e-06, "loss": 0.0522, "step": 9690 }, { "epoch": 0.7544936986034743, "grad_norm": 0.4489152712610594, "learning_rate": 5.879942445155007e-06, "loss": 0.0503, "step": 9691 }, { "epoch": 0.754571553695684, "grad_norm": 0.4009227959787337, "learning_rate": 5.876407643752029e-06, "loss": 0.0411, "step": 9692 }, { "epoch": 0.7546494087878935, "grad_norm": 0.6370266364600699, "learning_rate": 5.872873722182475e-06, "loss": 0.0942, "step": 9693 }, { "epoch": 0.7547272638801031, "grad_norm": 0.5544009251972462, "learning_rate": 5.869340680666478e-06, "loss": 0.0634, "step": 9694 }, { "epoch": 0.7548051189723127, "grad_norm": 0.46707075588540103, "learning_rate": 5.86580851942413e-06, "loss": 0.0485, "step": 9695 }, { "epoch": 0.7548829740645224, "grad_norm": 0.34343559018580244, "learning_rate": 5.862277238675482e-06, "loss": 0.0261, "step": 9696 }, { "epoch": 0.754960829156732, "grad_norm": 0.64415851228535, "learning_rate": 5.8587468386405096e-06, "loss": 0.0802, "step": 9697 }, { "epoch": 0.7550386842489416, "grad_norm": 0.4371886383858318, "learning_rate": 5.8552173195391374e-06, "loss": 0.0428, "step": 9698 }, { "epoch": 0.7551165393411513, "grad_norm": 0.5497002901591322, "learning_rate": 5.851688681591241e-06, "loss": 0.0563, "step": 9699 }, { "epoch": 0.7551943944333609, "grad_norm": 0.5114447328220963, "learning_rate": 5.8481609250166304e-06, "loss": 0.0531, "step": 9700 }, { "epoch": 0.7551943944333609, "eval_loss": 0.006529796868562698, "eval_runtime": 167.33, "eval_samples_per_second": 17.211, "eval_steps_per_second": 0.616, "step": 9700 }, { "epoch": 0.7552722495255705, "grad_norm": 0.38228814047743465, "learning_rate": 5.84463405003508e-06, "loss": 0.0351, "step": 9701 }, { "epoch": 0.7553501046177802, "grad_norm": 0.46422471320635744, "learning_rate": 5.841108056866301e-06, "loss": 0.041, "step": 9702 }, { "epoch": 0.7554279597099898, "grad_norm": 0.4767319724637448, "learning_rate": 5.837582945729929e-06, "loss": 0.0486, "step": 9703 }, { "epoch": 0.7555058148021994, "grad_norm": 0.44174447209237416, "learning_rate": 5.83405871684557e-06, "loss": 0.0423, "step": 9704 }, { "epoch": 0.7555836698944091, "grad_norm": 0.5624311839505164, "learning_rate": 5.8305353704327615e-06, "loss": 0.0694, "step": 9705 }, { "epoch": 0.7556615249866186, "grad_norm": 0.4598683527746919, "learning_rate": 5.827012906711005e-06, "loss": 0.0548, "step": 9706 }, { "epoch": 0.7557393800788282, "grad_norm": 0.4465905184709668, "learning_rate": 5.8234913258997235e-06, "loss": 0.0535, "step": 9707 }, { "epoch": 0.7558172351710379, "grad_norm": 0.3348447536599247, "learning_rate": 5.819970628218296e-06, "loss": 0.0219, "step": 9708 }, { "epoch": 0.7558950902632475, "grad_norm": 0.43942605385227673, "learning_rate": 5.816450813886048e-06, "loss": 0.0402, "step": 9709 }, { "epoch": 0.7559729453554571, "grad_norm": 0.4193572872791139, "learning_rate": 5.812931883122246e-06, "loss": 0.0518, "step": 9710 }, { "epoch": 0.7560508004476668, "grad_norm": 0.4048793748522417, "learning_rate": 5.809413836146103e-06, "loss": 0.0447, "step": 9711 }, { "epoch": 0.7561286555398764, "grad_norm": 0.4332635537678726, "learning_rate": 5.805896673176774e-06, "loss": 0.0318, "step": 9712 }, { "epoch": 0.756206510632086, "grad_norm": 0.4915503274844056, "learning_rate": 5.8023803944333645e-06, "loss": 0.0491, "step": 9713 }, { "epoch": 0.7562843657242957, "grad_norm": 0.5268662260787728, "learning_rate": 5.798865000134924e-06, "loss": 0.045, "step": 9714 }, { "epoch": 0.7563622208165053, "grad_norm": 0.38444461723267687, "learning_rate": 5.79535049050044e-06, "loss": 0.0307, "step": 9715 }, { "epoch": 0.7564400759087149, "grad_norm": 0.48854379224343175, "learning_rate": 5.791836865748848e-06, "loss": 0.0464, "step": 9716 }, { "epoch": 0.7565179310009246, "grad_norm": 0.4516875185876404, "learning_rate": 5.788324126099041e-06, "loss": 0.054, "step": 9717 }, { "epoch": 0.7565957860931342, "grad_norm": 0.4717967266192393, "learning_rate": 5.784812271769842e-06, "loss": 0.0403, "step": 9718 }, { "epoch": 0.7566736411853437, "grad_norm": 0.48594550969559924, "learning_rate": 5.781301302980018e-06, "loss": 0.0574, "step": 9719 }, { "epoch": 0.7567514962775534, "grad_norm": 0.42144577678997763, "learning_rate": 5.77779121994829e-06, "loss": 0.0385, "step": 9720 }, { "epoch": 0.756829351369763, "grad_norm": 0.4256764761427309, "learning_rate": 5.774282022893321e-06, "loss": 0.041, "step": 9721 }, { "epoch": 0.7569072064619726, "grad_norm": 0.5343774836616767, "learning_rate": 5.770773712033713e-06, "loss": 0.0525, "step": 9722 }, { "epoch": 0.7569850615541823, "grad_norm": 0.5204759322666028, "learning_rate": 5.767266287588018e-06, "loss": 0.0603, "step": 9723 }, { "epoch": 0.7570629166463919, "grad_norm": 0.5066138845288432, "learning_rate": 5.7637597497747355e-06, "loss": 0.0576, "step": 9724 }, { "epoch": 0.7571407717386015, "grad_norm": 0.551366393397248, "learning_rate": 5.760254098812305e-06, "loss": 0.0607, "step": 9725 }, { "epoch": 0.7572186268308112, "grad_norm": 0.5709081637345307, "learning_rate": 5.756749334919105e-06, "loss": 0.0758, "step": 9726 }, { "epoch": 0.7572964819230208, "grad_norm": 0.46983143210313655, "learning_rate": 5.7532454583134765e-06, "loss": 0.0497, "step": 9727 }, { "epoch": 0.7573743370152304, "grad_norm": 0.46617471814024364, "learning_rate": 5.749742469213693e-06, "loss": 0.0497, "step": 9728 }, { "epoch": 0.7574521921074401, "grad_norm": 0.43024609287551485, "learning_rate": 5.746240367837969e-06, "loss": 0.0339, "step": 9729 }, { "epoch": 0.7575300471996497, "grad_norm": 0.533790704654163, "learning_rate": 5.742739154404473e-06, "loss": 0.0432, "step": 9730 }, { "epoch": 0.7576079022918593, "grad_norm": 0.4883182549146291, "learning_rate": 5.739238829131313e-06, "loss": 0.0546, "step": 9731 }, { "epoch": 0.7576857573840688, "grad_norm": 0.43947526801620623, "learning_rate": 5.735739392236541e-06, "loss": 0.04, "step": 9732 }, { "epoch": 0.7577636124762785, "grad_norm": 0.4582942282116138, "learning_rate": 5.7322408439381576e-06, "loss": 0.0445, "step": 9733 }, { "epoch": 0.7578414675684881, "grad_norm": 0.5170143948272853, "learning_rate": 5.728743184454106e-06, "loss": 0.0592, "step": 9734 }, { "epoch": 0.7579193226606977, "grad_norm": 0.6206774993469225, "learning_rate": 5.7252464140022724e-06, "loss": 0.0743, "step": 9735 }, { "epoch": 0.7579971777529074, "grad_norm": 0.5654257251314698, "learning_rate": 5.721750532800483e-06, "loss": 0.0522, "step": 9736 }, { "epoch": 0.758075032845117, "grad_norm": 0.4969722449759908, "learning_rate": 5.718255541066529e-06, "loss": 0.0595, "step": 9737 }, { "epoch": 0.7581528879373266, "grad_norm": 0.5117537956607858, "learning_rate": 5.714761439018124e-06, "loss": 0.0599, "step": 9738 }, { "epoch": 0.7582307430295363, "grad_norm": 0.45706948174441364, "learning_rate": 5.711268226872937e-06, "loss": 0.0537, "step": 9739 }, { "epoch": 0.7583085981217459, "grad_norm": 0.47642649918426944, "learning_rate": 5.707775904848573e-06, "loss": 0.0449, "step": 9740 }, { "epoch": 0.7583864532139555, "grad_norm": 0.5515747561323319, "learning_rate": 5.7042844731625935e-06, "loss": 0.0639, "step": 9741 }, { "epoch": 0.7584643083061652, "grad_norm": 0.46718356650175896, "learning_rate": 5.700793932032493e-06, "loss": 0.0471, "step": 9742 }, { "epoch": 0.7585421633983748, "grad_norm": 0.49853172519505357, "learning_rate": 5.697304281675722e-06, "loss": 0.0523, "step": 9743 }, { "epoch": 0.7586200184905844, "grad_norm": 0.4399189095052358, "learning_rate": 5.693815522309662e-06, "loss": 0.0447, "step": 9744 }, { "epoch": 0.7586978735827941, "grad_norm": 0.5140505145515576, "learning_rate": 5.690327654151653e-06, "loss": 0.0426, "step": 9745 }, { "epoch": 0.7587757286750036, "grad_norm": 0.6327513564614289, "learning_rate": 5.6868406774189675e-06, "loss": 0.0653, "step": 9746 }, { "epoch": 0.7588535837672132, "grad_norm": 0.4370077225577197, "learning_rate": 5.683354592328825e-06, "loss": 0.0523, "step": 9747 }, { "epoch": 0.7589314388594229, "grad_norm": 0.409218552099743, "learning_rate": 5.679869399098401e-06, "loss": 0.0424, "step": 9748 }, { "epoch": 0.7590092939516325, "grad_norm": 0.47041656242791213, "learning_rate": 5.676385097944805e-06, "loss": 0.0474, "step": 9749 }, { "epoch": 0.7590871490438421, "grad_norm": 0.5677503491090486, "learning_rate": 5.67290168908509e-06, "loss": 0.0605, "step": 9750 }, { "epoch": 0.7590871490438421, "eval_loss": 0.006493640597909689, "eval_runtime": 167.276, "eval_samples_per_second": 17.217, "eval_steps_per_second": 0.616, "step": 9750 }, { "epoch": 0.7591650041360518, "grad_norm": 0.3518747472725796, "learning_rate": 5.669419172736264e-06, "loss": 0.023, "step": 9751 }, { "epoch": 0.7592428592282614, "grad_norm": 0.3973592826248844, "learning_rate": 5.66593754911525e-06, "loss": 0.0299, "step": 9752 }, { "epoch": 0.759320714320471, "grad_norm": 0.46725940433324137, "learning_rate": 5.662456818438957e-06, "loss": 0.0465, "step": 9753 }, { "epoch": 0.7593985694126807, "grad_norm": 0.5159679001239886, "learning_rate": 5.6589769809242115e-06, "loss": 0.0618, "step": 9754 }, { "epoch": 0.7594764245048903, "grad_norm": 0.4077487616658878, "learning_rate": 5.655498036787792e-06, "loss": 0.0306, "step": 9755 }, { "epoch": 0.7595542795970999, "grad_norm": 0.4727743872100946, "learning_rate": 5.652019986246419e-06, "loss": 0.0545, "step": 9756 }, { "epoch": 0.7596321346893096, "grad_norm": 0.5070922852974185, "learning_rate": 5.648542829516754e-06, "loss": 0.0481, "step": 9757 }, { "epoch": 0.7597099897815192, "grad_norm": 0.4183089467827372, "learning_rate": 5.64506656681542e-06, "loss": 0.0324, "step": 9758 }, { "epoch": 0.7597878448737287, "grad_norm": 0.48130400198420653, "learning_rate": 5.641591198358962e-06, "loss": 0.0526, "step": 9759 }, { "epoch": 0.7598656999659384, "grad_norm": 0.3956611826960798, "learning_rate": 5.638116724363891e-06, "loss": 0.0366, "step": 9760 }, { "epoch": 0.759943555058148, "grad_norm": 0.45388495814488367, "learning_rate": 5.634643145046632e-06, "loss": 0.0466, "step": 9761 }, { "epoch": 0.7600214101503576, "grad_norm": 0.5710789849986838, "learning_rate": 5.631170460623578e-06, "loss": 0.0684, "step": 9762 }, { "epoch": 0.7600992652425673, "grad_norm": 0.3265806920289823, "learning_rate": 5.627698671311071e-06, "loss": 0.018, "step": 9763 }, { "epoch": 0.7601771203347769, "grad_norm": 0.48006070128667166, "learning_rate": 5.62422777732538e-06, "loss": 0.0507, "step": 9764 }, { "epoch": 0.7602549754269865, "grad_norm": 0.5861212557719395, "learning_rate": 5.6207577788827265e-06, "loss": 0.0554, "step": 9765 }, { "epoch": 0.7603328305191962, "grad_norm": 0.4607929980560627, "learning_rate": 5.617288676199275e-06, "loss": 0.0504, "step": 9766 }, { "epoch": 0.7604106856114058, "grad_norm": 0.41634434031149703, "learning_rate": 5.613820469491129e-06, "loss": 0.0329, "step": 9767 }, { "epoch": 0.7604885407036154, "grad_norm": 0.43904857308615397, "learning_rate": 5.610353158974353e-06, "loss": 0.0332, "step": 9768 }, { "epoch": 0.760566395795825, "grad_norm": 0.401990233185089, "learning_rate": 5.606886744864943e-06, "loss": 0.029, "step": 9769 }, { "epoch": 0.7606442508880347, "grad_norm": 0.4750557211950296, "learning_rate": 5.603421227378829e-06, "loss": 0.0581, "step": 9770 }, { "epoch": 0.7607221059802443, "grad_norm": 0.35440212724194786, "learning_rate": 5.599956606731903e-06, "loss": 0.0273, "step": 9771 }, { "epoch": 0.7607999610724538, "grad_norm": 0.49294964277375036, "learning_rate": 5.596492883139988e-06, "loss": 0.0446, "step": 9772 }, { "epoch": 0.7608778161646635, "grad_norm": 0.5449914889315938, "learning_rate": 5.593030056818871e-06, "loss": 0.0461, "step": 9773 }, { "epoch": 0.7609556712568731, "grad_norm": 0.47365118095234465, "learning_rate": 5.5895681279842615e-06, "loss": 0.0524, "step": 9774 }, { "epoch": 0.7610335263490827, "grad_norm": 0.5776597882609678, "learning_rate": 5.586107096851823e-06, "loss": 0.0756, "step": 9775 }, { "epoch": 0.7611113814412924, "grad_norm": 0.5277926901883636, "learning_rate": 5.582646963637159e-06, "loss": 0.0554, "step": 9776 }, { "epoch": 0.761189236533502, "grad_norm": 0.48779378464318407, "learning_rate": 5.579187728555819e-06, "loss": 0.0537, "step": 9777 }, { "epoch": 0.7612670916257116, "grad_norm": 0.5673210375329797, "learning_rate": 5.57572939182331e-06, "loss": 0.0574, "step": 9778 }, { "epoch": 0.7613449467179213, "grad_norm": 0.5199831946003497, "learning_rate": 5.572271953655055e-06, "loss": 0.058, "step": 9779 }, { "epoch": 0.7614228018101309, "grad_norm": 0.46174517978805735, "learning_rate": 5.5688154142664395e-06, "loss": 0.048, "step": 9780 }, { "epoch": 0.7615006569023405, "grad_norm": 0.41437507162427506, "learning_rate": 5.565359773872792e-06, "loss": 0.0351, "step": 9781 }, { "epoch": 0.7615785119945502, "grad_norm": 0.5868952145048435, "learning_rate": 5.561905032689376e-06, "loss": 0.0682, "step": 9782 }, { "epoch": 0.7616563670867598, "grad_norm": 0.4114460567009037, "learning_rate": 5.558451190931416e-06, "loss": 0.0365, "step": 9783 }, { "epoch": 0.7617342221789694, "grad_norm": 0.5435655204783876, "learning_rate": 5.554998248814068e-06, "loss": 0.0701, "step": 9784 }, { "epoch": 0.7618120772711791, "grad_norm": 0.5438911120137383, "learning_rate": 5.551546206552429e-06, "loss": 0.0589, "step": 9785 }, { "epoch": 0.7618899323633886, "grad_norm": 0.5332737889964724, "learning_rate": 5.548095064361549e-06, "loss": 0.0577, "step": 9786 }, { "epoch": 0.7619677874555982, "grad_norm": 0.4665339644147825, "learning_rate": 5.544644822456416e-06, "loss": 0.0441, "step": 9787 }, { "epoch": 0.7620456425478079, "grad_norm": 0.36343898514121287, "learning_rate": 5.541195481051964e-06, "loss": 0.0344, "step": 9788 }, { "epoch": 0.7621234976400175, "grad_norm": 0.5068548382689272, "learning_rate": 5.537747040363073e-06, "loss": 0.0626, "step": 9789 }, { "epoch": 0.7622013527322271, "grad_norm": 0.5333024782687783, "learning_rate": 5.5342995006045604e-06, "loss": 0.0717, "step": 9790 }, { "epoch": 0.7622792078244368, "grad_norm": 0.38527827656114516, "learning_rate": 5.530852861991196e-06, "loss": 0.0299, "step": 9791 }, { "epoch": 0.7623570629166464, "grad_norm": 0.47522035528350054, "learning_rate": 5.527407124737687e-06, "loss": 0.0408, "step": 9792 }, { "epoch": 0.762434918008856, "grad_norm": 0.3956687299676656, "learning_rate": 5.52396228905868e-06, "loss": 0.033, "step": 9793 }, { "epoch": 0.7625127731010657, "grad_norm": 0.47875123836540984, "learning_rate": 5.5205183551687846e-06, "loss": 0.0627, "step": 9794 }, { "epoch": 0.7625906281932753, "grad_norm": 0.4789597502109002, "learning_rate": 5.517075323282537e-06, "loss": 0.0649, "step": 9795 }, { "epoch": 0.7626684832854849, "grad_norm": 0.428923601002517, "learning_rate": 5.513633193614423e-06, "loss": 0.049, "step": 9796 }, { "epoch": 0.7627463383776946, "grad_norm": 0.4275434619900992, "learning_rate": 5.510191966378866e-06, "loss": 0.0416, "step": 9797 }, { "epoch": 0.7628241934699042, "grad_norm": 0.41373813756624794, "learning_rate": 5.5067516417902425e-06, "loss": 0.0464, "step": 9798 }, { "epoch": 0.7629020485621137, "grad_norm": 0.44876217088363424, "learning_rate": 5.503312220062866e-06, "loss": 0.0491, "step": 9799 }, { "epoch": 0.7629799036543234, "grad_norm": 0.46996716151922374, "learning_rate": 5.4998737014110006e-06, "loss": 0.0456, "step": 9800 }, { "epoch": 0.7629799036543234, "eval_loss": 0.00643531046807766, "eval_runtime": 162.5394, "eval_samples_per_second": 17.719, "eval_steps_per_second": 0.634, "step": 9800 }, { "epoch": 0.763057758746533, "grad_norm": 0.5305373021864023, "learning_rate": 5.496436086048846e-06, "loss": 0.0582, "step": 9801 }, { "epoch": 0.7631356138387426, "grad_norm": 0.4289631915006659, "learning_rate": 5.492999374190549e-06, "loss": 0.0391, "step": 9802 }, { "epoch": 0.7632134689309522, "grad_norm": 0.6147048704541168, "learning_rate": 5.489563566050198e-06, "loss": 0.0693, "step": 9803 }, { "epoch": 0.7632913240231619, "grad_norm": 0.44830890492890574, "learning_rate": 5.486128661841836e-06, "loss": 0.0401, "step": 9804 }, { "epoch": 0.7633691791153715, "grad_norm": 0.4513651697753399, "learning_rate": 5.482694661779437e-06, "loss": 0.0441, "step": 9805 }, { "epoch": 0.7634470342075811, "grad_norm": 0.4916823072520348, "learning_rate": 5.479261566076926e-06, "loss": 0.0656, "step": 9806 }, { "epoch": 0.7635248892997908, "grad_norm": 0.39467798857998004, "learning_rate": 5.475829374948163e-06, "loss": 0.0373, "step": 9807 }, { "epoch": 0.7636027443920004, "grad_norm": 0.44849714228067156, "learning_rate": 5.472398088606963e-06, "loss": 0.0389, "step": 9808 }, { "epoch": 0.76368059948421, "grad_norm": 0.44218662997066105, "learning_rate": 5.468967707267074e-06, "loss": 0.0384, "step": 9809 }, { "epoch": 0.7637584545764197, "grad_norm": 0.4795029009003116, "learning_rate": 5.4655382311421955e-06, "loss": 0.0581, "step": 9810 }, { "epoch": 0.7638363096686293, "grad_norm": 0.46877680323815035, "learning_rate": 5.462109660445967e-06, "loss": 0.0475, "step": 9811 }, { "epoch": 0.7639141647608388, "grad_norm": 0.5582830741946364, "learning_rate": 5.458681995391975e-06, "loss": 0.0611, "step": 9812 }, { "epoch": 0.7639920198530485, "grad_norm": 0.47294020332465714, "learning_rate": 5.455255236193735e-06, "loss": 0.0504, "step": 9813 }, { "epoch": 0.7640698749452581, "grad_norm": 0.3994694986386326, "learning_rate": 5.451829383064735e-06, "loss": 0.0352, "step": 9814 }, { "epoch": 0.7641477300374677, "grad_norm": 0.4426831194689512, "learning_rate": 5.448404436218382e-06, "loss": 0.046, "step": 9815 }, { "epoch": 0.7642255851296774, "grad_norm": 0.3437079443459583, "learning_rate": 5.444980395868034e-06, "loss": 0.0186, "step": 9816 }, { "epoch": 0.764303440221887, "grad_norm": 0.4312258398463234, "learning_rate": 5.441557262226993e-06, "loss": 0.0453, "step": 9817 }, { "epoch": 0.7643812953140966, "grad_norm": 0.4004635255617844, "learning_rate": 5.4381350355085025e-06, "loss": 0.029, "step": 9818 }, { "epoch": 0.7644591504063063, "grad_norm": 0.5569003764982126, "learning_rate": 5.434713715925754e-06, "loss": 0.0738, "step": 9819 }, { "epoch": 0.7645370054985159, "grad_norm": 0.5356623672132007, "learning_rate": 5.431293303691878e-06, "loss": 0.0609, "step": 9820 }, { "epoch": 0.7646148605907255, "grad_norm": 0.4514373092987019, "learning_rate": 5.427873799019949e-06, "loss": 0.0446, "step": 9821 }, { "epoch": 0.7646927156829352, "grad_norm": 0.48053086277253454, "learning_rate": 5.4244552021229894e-06, "loss": 0.0374, "step": 9822 }, { "epoch": 0.7647705707751448, "grad_norm": 0.43303016066870514, "learning_rate": 5.421037513213956e-06, "loss": 0.0413, "step": 9823 }, { "epoch": 0.7648484258673544, "grad_norm": 0.4214729551753579, "learning_rate": 5.417620732505755e-06, "loss": 0.0372, "step": 9824 }, { "epoch": 0.764926280959564, "grad_norm": 0.5072726265614821, "learning_rate": 5.414204860211245e-06, "loss": 0.0387, "step": 9825 }, { "epoch": 0.7650041360517736, "grad_norm": 0.3982118305568456, "learning_rate": 5.410789896543214e-06, "loss": 0.0367, "step": 9826 }, { "epoch": 0.7650819911439832, "grad_norm": 0.490268316845599, "learning_rate": 5.407375841714402e-06, "loss": 0.0452, "step": 9827 }, { "epoch": 0.7651598462361929, "grad_norm": 0.44382020459849386, "learning_rate": 5.403962695937477e-06, "loss": 0.0428, "step": 9828 }, { "epoch": 0.7652377013284025, "grad_norm": 0.48268289960055694, "learning_rate": 5.400550459425063e-06, "loss": 0.0502, "step": 9829 }, { "epoch": 0.7653155564206121, "grad_norm": 0.5594388521800086, "learning_rate": 5.397139132389737e-06, "loss": 0.0473, "step": 9830 }, { "epoch": 0.7653934115128218, "grad_norm": 0.4144419558218941, "learning_rate": 5.393728715044004e-06, "loss": 0.0324, "step": 9831 }, { "epoch": 0.7654712666050314, "grad_norm": 0.3047466222418488, "learning_rate": 5.390319207600317e-06, "loss": 0.0179, "step": 9832 }, { "epoch": 0.765549121697241, "grad_norm": 0.3808052988763207, "learning_rate": 5.386910610271072e-06, "loss": 0.0242, "step": 9833 }, { "epoch": 0.7656269767894507, "grad_norm": 0.45311247730110826, "learning_rate": 5.383502923268602e-06, "loss": 0.0457, "step": 9834 }, { "epoch": 0.7657048318816603, "grad_norm": 0.566862296096603, "learning_rate": 5.380096146805203e-06, "loss": 0.0567, "step": 9835 }, { "epoch": 0.7657826869738699, "grad_norm": 0.4381792285425275, "learning_rate": 5.3766902810931e-06, "loss": 0.0408, "step": 9836 }, { "epoch": 0.7658605420660796, "grad_norm": 0.4668474196044432, "learning_rate": 5.37328532634445e-06, "loss": 0.0444, "step": 9837 }, { "epoch": 0.7659383971582892, "grad_norm": 0.5392105385196382, "learning_rate": 5.369881282771374e-06, "loss": 0.0515, "step": 9838 }, { "epoch": 0.7660162522504987, "grad_norm": 0.4036774445737191, "learning_rate": 5.366478150585919e-06, "loss": 0.0218, "step": 9839 }, { "epoch": 0.7660941073427083, "grad_norm": 0.6183649601530207, "learning_rate": 5.363075930000099e-06, "loss": 0.069, "step": 9840 }, { "epoch": 0.766171962434918, "grad_norm": 0.640505592438938, "learning_rate": 5.359674621225848e-06, "loss": 0.0784, "step": 9841 }, { "epoch": 0.7662498175271276, "grad_norm": 0.47507375316277567, "learning_rate": 5.356274224475053e-06, "loss": 0.0472, "step": 9842 }, { "epoch": 0.7663276726193372, "grad_norm": 0.46889871600382316, "learning_rate": 5.352874739959542e-06, "loss": 0.0391, "step": 9843 }, { "epoch": 0.7664055277115469, "grad_norm": 0.47978594846805456, "learning_rate": 5.349476167891081e-06, "loss": 0.0491, "step": 9844 }, { "epoch": 0.7664833828037565, "grad_norm": 0.4452155643152906, "learning_rate": 5.346078508481403e-06, "loss": 0.0474, "step": 9845 }, { "epoch": 0.7665612378959661, "grad_norm": 0.5422989459967124, "learning_rate": 5.34268176194215e-06, "loss": 0.0595, "step": 9846 }, { "epoch": 0.7666390929881758, "grad_norm": 0.40798523915677276, "learning_rate": 5.339285928484925e-06, "loss": 0.0325, "step": 9847 }, { "epoch": 0.7667169480803854, "grad_norm": 0.4023752665012334, "learning_rate": 5.335891008321277e-06, "loss": 0.0318, "step": 9848 }, { "epoch": 0.766794803172595, "grad_norm": 0.4423659720026311, "learning_rate": 5.332497001662686e-06, "loss": 0.0515, "step": 9849 }, { "epoch": 0.7668726582648047, "grad_norm": 0.4935445194742287, "learning_rate": 5.329103908720594e-06, "loss": 0.06, "step": 9850 }, { "epoch": 0.7668726582648047, "eval_loss": 0.00636960007250309, "eval_runtime": 166.0548, "eval_samples_per_second": 17.344, "eval_steps_per_second": 0.62, "step": 9850 }, { "epoch": 0.7669505133570143, "grad_norm": 0.46908893868560664, "learning_rate": 5.325711729706369e-06, "loss": 0.0522, "step": 9851 }, { "epoch": 0.7670283684492238, "grad_norm": 0.552808291633778, "learning_rate": 5.322320464831328e-06, "loss": 0.0572, "step": 9852 }, { "epoch": 0.7671062235414335, "grad_norm": 0.38491323924309667, "learning_rate": 5.318930114306731e-06, "loss": 0.039, "step": 9853 }, { "epoch": 0.7671840786336431, "grad_norm": 0.4935076727773493, "learning_rate": 5.31554067834378e-06, "loss": 0.0547, "step": 9854 }, { "epoch": 0.7672619337258527, "grad_norm": 0.3713708882701484, "learning_rate": 5.312152157153623e-06, "loss": 0.0275, "step": 9855 }, { "epoch": 0.7673397888180624, "grad_norm": 0.543028341533051, "learning_rate": 5.308764550947347e-06, "loss": 0.061, "step": 9856 }, { "epoch": 0.767417643910272, "grad_norm": 0.4822463840940873, "learning_rate": 5.305377859935985e-06, "loss": 0.0493, "step": 9857 }, { "epoch": 0.7674954990024816, "grad_norm": 0.5802608866247508, "learning_rate": 5.30199208433051e-06, "loss": 0.0675, "step": 9858 }, { "epoch": 0.7675733540946913, "grad_norm": 0.5224586560645988, "learning_rate": 5.298607224341837e-06, "loss": 0.0657, "step": 9859 }, { "epoch": 0.7676512091869009, "grad_norm": 0.41081459908205076, "learning_rate": 5.295223280180837e-06, "loss": 0.0282, "step": 9860 }, { "epoch": 0.7677290642791105, "grad_norm": 0.3375144145320659, "learning_rate": 5.291840252058307e-06, "loss": 0.0272, "step": 9861 }, { "epoch": 0.7678069193713202, "grad_norm": 0.4202324405756109, "learning_rate": 5.288458140184996e-06, "loss": 0.0496, "step": 9862 }, { "epoch": 0.7678847744635298, "grad_norm": 0.4924070844181182, "learning_rate": 5.285076944771592e-06, "loss": 0.0434, "step": 9863 }, { "epoch": 0.7679626295557394, "grad_norm": 0.4262260364255153, "learning_rate": 5.281696666028728e-06, "loss": 0.0326, "step": 9864 }, { "epoch": 0.768040484647949, "grad_norm": 0.49332608108393255, "learning_rate": 5.27831730416698e-06, "loss": 0.0508, "step": 9865 }, { "epoch": 0.7681183397401586, "grad_norm": 0.43213566061595715, "learning_rate": 5.274938859396865e-06, "loss": 0.0476, "step": 9866 }, { "epoch": 0.7681961948323682, "grad_norm": 0.47576215510070125, "learning_rate": 5.271561331928847e-06, "loss": 0.0544, "step": 9867 }, { "epoch": 0.7682740499245779, "grad_norm": 0.3936770725734558, "learning_rate": 5.268184721973329e-06, "loss": 0.0401, "step": 9868 }, { "epoch": 0.7683519050167875, "grad_norm": 0.3880751981609905, "learning_rate": 5.264809029740654e-06, "loss": 0.032, "step": 9869 }, { "epoch": 0.7684297601089971, "grad_norm": 0.4581630882490976, "learning_rate": 5.261434255441109e-06, "loss": 0.0364, "step": 9870 }, { "epoch": 0.7685076152012068, "grad_norm": 0.43222906282315177, "learning_rate": 5.258060399284941e-06, "loss": 0.0444, "step": 9871 }, { "epoch": 0.7685854702934164, "grad_norm": 0.5513013282758041, "learning_rate": 5.254687461482315e-06, "loss": 0.0535, "step": 9872 }, { "epoch": 0.768663325385626, "grad_norm": 0.5104227260978492, "learning_rate": 5.2513154422433525e-06, "loss": 0.0539, "step": 9873 }, { "epoch": 0.7687411804778357, "grad_norm": 0.4923613690958617, "learning_rate": 5.247944341778113e-06, "loss": 0.0473, "step": 9874 }, { "epoch": 0.7688190355700453, "grad_norm": 0.4181199202219999, "learning_rate": 5.244574160296598e-06, "loss": 0.047, "step": 9875 }, { "epoch": 0.7688968906622549, "grad_norm": 0.5116592766148633, "learning_rate": 5.241204898008758e-06, "loss": 0.0662, "step": 9876 }, { "epoch": 0.7689747457544645, "grad_norm": 0.454895119477685, "learning_rate": 5.237836555124481e-06, "loss": 0.0412, "step": 9877 }, { "epoch": 0.7690526008466742, "grad_norm": 0.513182681924847, "learning_rate": 5.234469131853597e-06, "loss": 0.062, "step": 9878 }, { "epoch": 0.7691304559388837, "grad_norm": 0.5520084492922701, "learning_rate": 5.231102628405882e-06, "loss": 0.0714, "step": 9879 }, { "epoch": 0.7692083110310933, "grad_norm": 0.5819252774219694, "learning_rate": 5.227737044991048e-06, "loss": 0.0807, "step": 9880 }, { "epoch": 0.769286166123303, "grad_norm": 0.4887112795159915, "learning_rate": 5.224372381818765e-06, "loss": 0.0457, "step": 9881 }, { "epoch": 0.7693640212155126, "grad_norm": 0.4773250996198332, "learning_rate": 5.221008639098633e-06, "loss": 0.0574, "step": 9882 }, { "epoch": 0.7694418763077222, "grad_norm": 0.5566958649087362, "learning_rate": 5.217645817040194e-06, "loss": 0.071, "step": 9883 }, { "epoch": 0.7695197313999319, "grad_norm": 0.502269353528707, "learning_rate": 5.214283915852945e-06, "loss": 0.0671, "step": 9884 }, { "epoch": 0.7695975864921415, "grad_norm": 0.5354066974700216, "learning_rate": 5.210922935746294e-06, "loss": 0.0716, "step": 9885 }, { "epoch": 0.7696754415843511, "grad_norm": 0.40208404007303433, "learning_rate": 5.207562876929637e-06, "loss": 0.0297, "step": 9886 }, { "epoch": 0.7697532966765608, "grad_norm": 0.3421016543930915, "learning_rate": 5.204203739612284e-06, "loss": 0.0231, "step": 9887 }, { "epoch": 0.7698311517687704, "grad_norm": 0.40908913751574916, "learning_rate": 5.200845524003489e-06, "loss": 0.0447, "step": 9888 }, { "epoch": 0.76990900686098, "grad_norm": 0.4248791379159484, "learning_rate": 5.197488230312457e-06, "loss": 0.0306, "step": 9889 }, { "epoch": 0.7699868619531897, "grad_norm": 0.39570380010668477, "learning_rate": 5.194131858748323e-06, "loss": 0.027, "step": 9890 }, { "epoch": 0.7700647170453992, "grad_norm": 0.3931264901685421, "learning_rate": 5.190776409520188e-06, "loss": 0.031, "step": 9891 }, { "epoch": 0.7701425721376088, "grad_norm": 0.42393665906295436, "learning_rate": 5.187421882837074e-06, "loss": 0.0443, "step": 9892 }, { "epoch": 0.7702204272298185, "grad_norm": 0.36370865407388514, "learning_rate": 5.184068278907948e-06, "loss": 0.0271, "step": 9893 }, { "epoch": 0.7702982823220281, "grad_norm": 0.4209238300318379, "learning_rate": 5.180715597941737e-06, "loss": 0.0417, "step": 9894 }, { "epoch": 0.7703761374142377, "grad_norm": 0.5602065550789288, "learning_rate": 5.1773638401472735e-06, "loss": 0.0643, "step": 9895 }, { "epoch": 0.7704539925064474, "grad_norm": 0.49497264743694797, "learning_rate": 5.174013005733376e-06, "loss": 0.0578, "step": 9896 }, { "epoch": 0.770531847598657, "grad_norm": 0.5082827314494045, "learning_rate": 5.170663094908779e-06, "loss": 0.0408, "step": 9897 }, { "epoch": 0.7706097026908666, "grad_norm": 0.5393876261718816, "learning_rate": 5.167314107882169e-06, "loss": 0.064, "step": 9898 }, { "epoch": 0.7706875577830763, "grad_norm": 0.523939495614992, "learning_rate": 5.1639660448621695e-06, "loss": 0.0466, "step": 9899 }, { "epoch": 0.7707654128752859, "grad_norm": 0.5437083943850629, "learning_rate": 5.160618906057351e-06, "loss": 0.0542, "step": 9900 }, { "epoch": 0.7707654128752859, "eval_loss": 0.006329935044050217, "eval_runtime": 163.0479, "eval_samples_per_second": 17.664, "eval_steps_per_second": 0.632, "step": 9900 }, { "epoch": 0.7708432679674955, "grad_norm": 0.3739576910748595, "learning_rate": 5.157272691676216e-06, "loss": 0.0329, "step": 9901 }, { "epoch": 0.7709211230597052, "grad_norm": 0.4364934209761968, "learning_rate": 5.153927401927232e-06, "loss": 0.0434, "step": 9902 }, { "epoch": 0.7709989781519148, "grad_norm": 0.37663561419102976, "learning_rate": 5.1505830370187944e-06, "loss": 0.0348, "step": 9903 }, { "epoch": 0.7710768332441243, "grad_norm": 0.4117787465065817, "learning_rate": 5.147239597159228e-06, "loss": 0.0344, "step": 9904 }, { "epoch": 0.771154688336334, "grad_norm": 0.5445256848804279, "learning_rate": 5.143897082556821e-06, "loss": 0.0502, "step": 9905 }, { "epoch": 0.7712325434285436, "grad_norm": 0.4929910460635318, "learning_rate": 5.140555493419792e-06, "loss": 0.0561, "step": 9906 }, { "epoch": 0.7713103985207532, "grad_norm": 0.45255606200856413, "learning_rate": 5.137214829956314e-06, "loss": 0.0427, "step": 9907 }, { "epoch": 0.7713882536129629, "grad_norm": 0.7728342996389581, "learning_rate": 5.133875092374492e-06, "loss": 0.0985, "step": 9908 }, { "epoch": 0.7714661087051725, "grad_norm": 0.5979431331432876, "learning_rate": 5.130536280882373e-06, "loss": 0.073, "step": 9909 }, { "epoch": 0.7715439637973821, "grad_norm": 0.48790475268834205, "learning_rate": 5.127198395687953e-06, "loss": 0.0566, "step": 9910 }, { "epoch": 0.7716218188895917, "grad_norm": 0.37830751087033787, "learning_rate": 5.123861436999158e-06, "loss": 0.0294, "step": 9911 }, { "epoch": 0.7716996739818014, "grad_norm": 0.5800395905729795, "learning_rate": 5.120525405023881e-06, "loss": 0.0623, "step": 9912 }, { "epoch": 0.771777529074011, "grad_norm": 0.5994360360206462, "learning_rate": 5.117190299969929e-06, "loss": 0.0687, "step": 9913 }, { "epoch": 0.7718553841662206, "grad_norm": 0.5456331334364256, "learning_rate": 5.113856122045065e-06, "loss": 0.0544, "step": 9914 }, { "epoch": 0.7719332392584303, "grad_norm": 0.46130371892728433, "learning_rate": 5.110522871456993e-06, "loss": 0.0481, "step": 9915 }, { "epoch": 0.7720110943506399, "grad_norm": 0.4329391832941962, "learning_rate": 5.107190548413355e-06, "loss": 0.0356, "step": 9916 }, { "epoch": 0.7720889494428494, "grad_norm": 0.4651387994526113, "learning_rate": 5.1038591531217465e-06, "loss": 0.0472, "step": 9917 }, { "epoch": 0.7721668045350591, "grad_norm": 0.45973368319931796, "learning_rate": 5.100528685789696e-06, "loss": 0.0519, "step": 9918 }, { "epoch": 0.7722446596272687, "grad_norm": 0.5637371975346823, "learning_rate": 5.097199146624676e-06, "loss": 0.0645, "step": 9919 }, { "epoch": 0.7723225147194783, "grad_norm": 0.5273665930045766, "learning_rate": 5.093870535834098e-06, "loss": 0.0631, "step": 9920 }, { "epoch": 0.772400369811688, "grad_norm": 0.5336784492450287, "learning_rate": 5.090542853625322e-06, "loss": 0.058, "step": 9921 }, { "epoch": 0.7724782249038976, "grad_norm": 0.4106884985105055, "learning_rate": 5.087216100205645e-06, "loss": 0.0279, "step": 9922 }, { "epoch": 0.7725560799961072, "grad_norm": 0.47841117889288964, "learning_rate": 5.083890275782308e-06, "loss": 0.0461, "step": 9923 }, { "epoch": 0.7726339350883169, "grad_norm": 0.5347732248796563, "learning_rate": 5.080565380562496e-06, "loss": 0.0561, "step": 9924 }, { "epoch": 0.7727117901805265, "grad_norm": 0.43071054596405733, "learning_rate": 5.077241414753333e-06, "loss": 0.0321, "step": 9925 }, { "epoch": 0.7727896452727361, "grad_norm": 0.573648851821008, "learning_rate": 5.0739183785618795e-06, "loss": 0.0825, "step": 9926 }, { "epoch": 0.7728675003649458, "grad_norm": 0.44358489702213194, "learning_rate": 5.070596272195161e-06, "loss": 0.0425, "step": 9927 }, { "epoch": 0.7729453554571554, "grad_norm": 0.4338113805651441, "learning_rate": 5.067275095860118e-06, "loss": 0.0362, "step": 9928 }, { "epoch": 0.773023210549365, "grad_norm": 0.4168424986396583, "learning_rate": 5.063954849763646e-06, "loss": 0.0384, "step": 9929 }, { "epoch": 0.7731010656415747, "grad_norm": 0.5084291094331719, "learning_rate": 5.060635534112583e-06, "loss": 0.0548, "step": 9930 }, { "epoch": 0.7731789207337842, "grad_norm": 0.48066801565133177, "learning_rate": 5.057317149113705e-06, "loss": 0.0435, "step": 9931 }, { "epoch": 0.7732567758259938, "grad_norm": 0.3990009768934366, "learning_rate": 5.053999694973731e-06, "loss": 0.0312, "step": 9932 }, { "epoch": 0.7733346309182035, "grad_norm": 0.4143962288313665, "learning_rate": 5.050683171899324e-06, "loss": 0.0354, "step": 9933 }, { "epoch": 0.7734124860104131, "grad_norm": 0.5047822092574404, "learning_rate": 5.047367580097087e-06, "loss": 0.0474, "step": 9934 }, { "epoch": 0.7734903411026227, "grad_norm": 0.5606409151647144, "learning_rate": 5.044052919773565e-06, "loss": 0.0627, "step": 9935 }, { "epoch": 0.7735681961948324, "grad_norm": 0.46980111434499755, "learning_rate": 5.040739191135244e-06, "loss": 0.0452, "step": 9936 }, { "epoch": 0.773646051287042, "grad_norm": 0.4830059297951916, "learning_rate": 5.03742639438856e-06, "loss": 0.0464, "step": 9937 }, { "epoch": 0.7737239063792516, "grad_norm": 0.45902752100218336, "learning_rate": 5.034114529739882e-06, "loss": 0.0477, "step": 9938 }, { "epoch": 0.7738017614714613, "grad_norm": 0.44531397072919376, "learning_rate": 5.030803597395524e-06, "loss": 0.04, "step": 9939 }, { "epoch": 0.7738796165636709, "grad_norm": 0.4711622997376128, "learning_rate": 5.027493597561739e-06, "loss": 0.0511, "step": 9940 }, { "epoch": 0.7739574716558805, "grad_norm": 0.5080311275129475, "learning_rate": 5.024184530444727e-06, "loss": 0.0495, "step": 9941 }, { "epoch": 0.7740353267480902, "grad_norm": 0.3890966418391768, "learning_rate": 5.020876396250624e-06, "loss": 0.0374, "step": 9942 }, { "epoch": 0.7741131818402998, "grad_norm": 0.47007514548935586, "learning_rate": 5.0175691951855165e-06, "loss": 0.0545, "step": 9943 }, { "epoch": 0.7741910369325093, "grad_norm": 0.5894445683711108, "learning_rate": 5.0142629274554245e-06, "loss": 0.06, "step": 9944 }, { "epoch": 0.774268892024719, "grad_norm": 0.4539357034199025, "learning_rate": 5.01095759326631e-06, "loss": 0.0377, "step": 9945 }, { "epoch": 0.7743467471169286, "grad_norm": 0.30424995576595226, "learning_rate": 5.007653192824084e-06, "loss": 0.025, "step": 9946 }, { "epoch": 0.7744246022091382, "grad_norm": 0.913851476796746, "learning_rate": 5.00434972633459e-06, "loss": 0.073, "step": 9947 }, { "epoch": 0.7745024573013478, "grad_norm": 0.4978514914165158, "learning_rate": 5.001047194003625e-06, "loss": 0.0435, "step": 9948 }, { "epoch": 0.7745803123935575, "grad_norm": 0.39609925738360163, "learning_rate": 4.997745596036921e-06, "loss": 0.0275, "step": 9949 }, { "epoch": 0.7746581674857671, "grad_norm": 0.5487024660628466, "learning_rate": 4.994444932640148e-06, "loss": 0.0548, "step": 9950 }, { "epoch": 0.7746581674857671, "eval_loss": 0.006307026371359825, "eval_runtime": 163.1553, "eval_samples_per_second": 17.652, "eval_steps_per_second": 0.631, "step": 9950 }, { "epoch": 0.7747360225779767, "grad_norm": 0.6075568710554237, "learning_rate": 4.99114520401893e-06, "loss": 0.0695, "step": 9951 }, { "epoch": 0.7748138776701864, "grad_norm": 0.45478750510773086, "learning_rate": 4.9878464103788045e-06, "loss": 0.0411, "step": 9952 }, { "epoch": 0.774891732762396, "grad_norm": 0.4851373844829964, "learning_rate": 4.984548551925292e-06, "loss": 0.0555, "step": 9953 }, { "epoch": 0.7749695878546056, "grad_norm": 0.45452137520951474, "learning_rate": 4.981251628863826e-06, "loss": 0.0293, "step": 9954 }, { "epoch": 0.7750474429468153, "grad_norm": 0.6171923294423357, "learning_rate": 4.977955641399788e-06, "loss": 0.0717, "step": 9955 }, { "epoch": 0.7751252980390249, "grad_norm": 0.442496064296817, "learning_rate": 4.974660589738504e-06, "loss": 0.0358, "step": 9956 }, { "epoch": 0.7752031531312344, "grad_norm": 0.4458707480098359, "learning_rate": 4.971366474085235e-06, "loss": 0.0372, "step": 9957 }, { "epoch": 0.7752810082234441, "grad_norm": 0.5062149482158428, "learning_rate": 4.968073294645199e-06, "loss": 0.0477, "step": 9958 }, { "epoch": 0.7753588633156537, "grad_norm": 0.49842510455639266, "learning_rate": 4.964781051623539e-06, "loss": 0.0556, "step": 9959 }, { "epoch": 0.7754367184078633, "grad_norm": 0.36869732592210697, "learning_rate": 4.961489745225356e-06, "loss": 0.0266, "step": 9960 }, { "epoch": 0.775514573500073, "grad_norm": 0.48067125361881297, "learning_rate": 4.958199375655668e-06, "loss": 0.051, "step": 9961 }, { "epoch": 0.7755924285922826, "grad_norm": 0.7108443586854261, "learning_rate": 4.95490994311945e-06, "loss": 0.0796, "step": 9962 }, { "epoch": 0.7756702836844922, "grad_norm": 0.37911886411779144, "learning_rate": 4.951621447821631e-06, "loss": 0.0298, "step": 9963 }, { "epoch": 0.7757481387767019, "grad_norm": 0.47874765588544854, "learning_rate": 4.948333889967061e-06, "loss": 0.0459, "step": 9964 }, { "epoch": 0.7758259938689115, "grad_norm": 0.529923500348832, "learning_rate": 4.945047269760541e-06, "loss": 0.0541, "step": 9965 }, { "epoch": 0.7759038489611211, "grad_norm": 0.4523699317742582, "learning_rate": 4.94176158740681e-06, "loss": 0.0368, "step": 9966 }, { "epoch": 0.7759817040533308, "grad_norm": 0.4777348302494519, "learning_rate": 4.938476843110549e-06, "loss": 0.0382, "step": 9967 }, { "epoch": 0.7760595591455404, "grad_norm": 0.5548374513120787, "learning_rate": 4.935193037076391e-06, "loss": 0.0585, "step": 9968 }, { "epoch": 0.77613741423775, "grad_norm": 0.4394272022987922, "learning_rate": 4.9319101695089e-06, "loss": 0.0393, "step": 9969 }, { "epoch": 0.7762152693299597, "grad_norm": 0.44250863867574175, "learning_rate": 4.928628240612574e-06, "loss": 0.0386, "step": 9970 }, { "epoch": 0.7762931244221692, "grad_norm": 0.4724321130610411, "learning_rate": 4.925347250591868e-06, "loss": 0.0463, "step": 9971 }, { "epoch": 0.7763709795143788, "grad_norm": 0.42475919942856094, "learning_rate": 4.922067199651163e-06, "loss": 0.0478, "step": 9972 }, { "epoch": 0.7764488346065885, "grad_norm": 0.5335450374806484, "learning_rate": 4.918788087994808e-06, "loss": 0.0512, "step": 9973 }, { "epoch": 0.7765266896987981, "grad_norm": 0.5592971309946791, "learning_rate": 4.9155099158270636e-06, "loss": 0.0431, "step": 9974 }, { "epoch": 0.7766045447910077, "grad_norm": 0.4192942631492275, "learning_rate": 4.912232683352152e-06, "loss": 0.037, "step": 9975 }, { "epoch": 0.7766823998832174, "grad_norm": 0.41014811562730574, "learning_rate": 4.908956390774222e-06, "loss": 0.0308, "step": 9976 }, { "epoch": 0.776760254975427, "grad_norm": 0.516971006288193, "learning_rate": 4.905681038297377e-06, "loss": 0.05, "step": 9977 }, { "epoch": 0.7768381100676366, "grad_norm": 0.4109099512902995, "learning_rate": 4.9024066261256505e-06, "loss": 0.0358, "step": 9978 }, { "epoch": 0.7769159651598463, "grad_norm": 0.4255167471689502, "learning_rate": 4.89913315446303e-06, "loss": 0.0419, "step": 9979 }, { "epoch": 0.7769938202520559, "grad_norm": 0.45424331743044766, "learning_rate": 4.895860623513431e-06, "loss": 0.0534, "step": 9980 }, { "epoch": 0.7770716753442655, "grad_norm": 0.5012672948298308, "learning_rate": 4.8925890334807216e-06, "loss": 0.05, "step": 9981 }, { "epoch": 0.7771495304364751, "grad_norm": 0.512920719283941, "learning_rate": 4.889318384568702e-06, "loss": 0.0505, "step": 9982 }, { "epoch": 0.7772273855286848, "grad_norm": 0.45202265628206817, "learning_rate": 4.886048676981117e-06, "loss": 0.0461, "step": 9983 }, { "epoch": 0.7773052406208943, "grad_norm": 0.3134717582207675, "learning_rate": 4.882779910921662e-06, "loss": 0.023, "step": 9984 }, { "epoch": 0.7773830957131039, "grad_norm": 0.5114642488782806, "learning_rate": 4.87951208659396e-06, "loss": 0.0395, "step": 9985 }, { "epoch": 0.7774609508053136, "grad_norm": 0.38167216195837406, "learning_rate": 4.876245204201586e-06, "loss": 0.0342, "step": 9986 }, { "epoch": 0.7775388058975232, "grad_norm": 0.5733959983751533, "learning_rate": 4.8729792639480455e-06, "loss": 0.0614, "step": 9987 }, { "epoch": 0.7776166609897328, "grad_norm": 0.520309123083762, "learning_rate": 4.8697142660367934e-06, "loss": 0.0669, "step": 9988 }, { "epoch": 0.7776945160819425, "grad_norm": 0.4858171939266944, "learning_rate": 4.866450210671225e-06, "loss": 0.0467, "step": 9989 }, { "epoch": 0.7777723711741521, "grad_norm": 0.39442958620716656, "learning_rate": 4.863187098054673e-06, "loss": 0.0359, "step": 9990 }, { "epoch": 0.7778502262663617, "grad_norm": 0.5775067616586215, "learning_rate": 4.859924928390418e-06, "loss": 0.0665, "step": 9991 }, { "epoch": 0.7779280813585714, "grad_norm": 0.607859522147855, "learning_rate": 4.856663701881674e-06, "loss": 0.068, "step": 9992 }, { "epoch": 0.778005936450781, "grad_norm": 0.34914086068237116, "learning_rate": 4.8534034187315945e-06, "loss": 0.0258, "step": 9993 }, { "epoch": 0.7780837915429906, "grad_norm": 0.47553986506213425, "learning_rate": 4.850144079143295e-06, "loss": 0.0446, "step": 9994 }, { "epoch": 0.7781616466352003, "grad_norm": 0.476015251641557, "learning_rate": 4.846885683319807e-06, "loss": 0.0291, "step": 9995 }, { "epoch": 0.7782395017274099, "grad_norm": 0.4565502306243688, "learning_rate": 4.843628231464115e-06, "loss": 0.0405, "step": 9996 }, { "epoch": 0.7783173568196194, "grad_norm": 0.49195756830302334, "learning_rate": 4.840371723779143e-06, "loss": 0.0442, "step": 9997 }, { "epoch": 0.7783952119118291, "grad_norm": 0.4807513135605422, "learning_rate": 4.837116160467756e-06, "loss": 0.0402, "step": 9998 }, { "epoch": 0.7784730670040387, "grad_norm": 0.4984966966757639, "learning_rate": 4.83386154173276e-06, "loss": 0.0507, "step": 9999 }, { "epoch": 0.7785509220962483, "grad_norm": 0.5428569707915515, "learning_rate": 4.830607867776904e-06, "loss": 0.0625, "step": 10000 }, { "epoch": 0.7785509220962483, "eval_loss": 0.006274817977100611, "eval_runtime": 162.5674, "eval_samples_per_second": 17.716, "eval_steps_per_second": 0.634, "step": 10000 }, { "epoch": 0.778628777188458, "grad_norm": 0.5132007382025836, "learning_rate": 4.827355138802874e-06, "loss": 0.0577, "step": 10001 }, { "epoch": 0.7787066322806676, "grad_norm": 0.5592484391275817, "learning_rate": 4.824103355013303e-06, "loss": 0.0655, "step": 10002 }, { "epoch": 0.7787844873728772, "grad_norm": 0.5031998132225471, "learning_rate": 4.820852516610752e-06, "loss": 0.0632, "step": 10003 }, { "epoch": 0.7788623424650869, "grad_norm": 0.41447319158646595, "learning_rate": 4.817602623797748e-06, "loss": 0.0352, "step": 10004 }, { "epoch": 0.7789401975572965, "grad_norm": 0.47660018970276335, "learning_rate": 4.814353676776737e-06, "loss": 0.044, "step": 10005 }, { "epoch": 0.7790180526495061, "grad_norm": 0.4512707862030496, "learning_rate": 4.811105675750114e-06, "loss": 0.0339, "step": 10006 }, { "epoch": 0.7790959077417158, "grad_norm": 0.520283939773381, "learning_rate": 4.807858620920212e-06, "loss": 0.0594, "step": 10007 }, { "epoch": 0.7791737628339254, "grad_norm": 0.6804632802940797, "learning_rate": 4.804612512489308e-06, "loss": 0.0844, "step": 10008 }, { "epoch": 0.779251617926135, "grad_norm": 0.468580826398244, "learning_rate": 4.80136735065962e-06, "loss": 0.0438, "step": 10009 }, { "epoch": 0.7793294730183447, "grad_norm": 0.40616003632445546, "learning_rate": 4.798123135633308e-06, "loss": 0.0361, "step": 10010 }, { "epoch": 0.7794073281105542, "grad_norm": 0.3965105754964712, "learning_rate": 4.794879867612468e-06, "loss": 0.0403, "step": 10011 }, { "epoch": 0.7794851832027638, "grad_norm": 0.47919658259216363, "learning_rate": 4.791637546799141e-06, "loss": 0.0613, "step": 10012 }, { "epoch": 0.7795630382949735, "grad_norm": 0.4863593012847097, "learning_rate": 4.7883961733953045e-06, "loss": 0.0576, "step": 10013 }, { "epoch": 0.7796408933871831, "grad_norm": 0.4196524377110354, "learning_rate": 4.785155747602892e-06, "loss": 0.0367, "step": 10014 }, { "epoch": 0.7797187484793927, "grad_norm": 0.4705968958741191, "learning_rate": 4.7819162696237605e-06, "loss": 0.0469, "step": 10015 }, { "epoch": 0.7797966035716024, "grad_norm": 0.5288665985895569, "learning_rate": 4.778677739659712e-06, "loss": 0.0582, "step": 10016 }, { "epoch": 0.779874458663812, "grad_norm": 0.4428270825747468, "learning_rate": 4.775440157912497e-06, "loss": 0.0471, "step": 10017 }, { "epoch": 0.7799523137560216, "grad_norm": 0.33440308607757235, "learning_rate": 4.772203524583801e-06, "loss": 0.0207, "step": 10018 }, { "epoch": 0.7800301688482312, "grad_norm": 0.45487157563058894, "learning_rate": 4.768967839875238e-06, "loss": 0.0502, "step": 10019 }, { "epoch": 0.7801080239404409, "grad_norm": 0.43114477796575074, "learning_rate": 4.765733103988393e-06, "loss": 0.0393, "step": 10020 }, { "epoch": 0.7801858790326505, "grad_norm": 0.5944114070586553, "learning_rate": 4.762499317124767e-06, "loss": 0.086, "step": 10021 }, { "epoch": 0.78026373412486, "grad_norm": 0.5403740582983318, "learning_rate": 4.759266479485811e-06, "loss": 0.0645, "step": 10022 }, { "epoch": 0.7803415892170698, "grad_norm": 0.4490596899908574, "learning_rate": 4.756034591272918e-06, "loss": 0.024, "step": 10023 }, { "epoch": 0.7804194443092793, "grad_norm": 0.6019429432160347, "learning_rate": 4.752803652687408e-06, "loss": 0.0834, "step": 10024 }, { "epoch": 0.7804972994014889, "grad_norm": 0.446267800271624, "learning_rate": 4.749573663930569e-06, "loss": 0.0379, "step": 10025 }, { "epoch": 0.7805751544936986, "grad_norm": 0.3885395483567994, "learning_rate": 4.746344625203609e-06, "loss": 0.0228, "step": 10026 }, { "epoch": 0.7806530095859082, "grad_norm": 0.492259876028065, "learning_rate": 4.7431165367076835e-06, "loss": 0.0484, "step": 10027 }, { "epoch": 0.7807308646781178, "grad_norm": 0.4953518718236272, "learning_rate": 4.73988939864388e-06, "loss": 0.0476, "step": 10028 }, { "epoch": 0.7808087197703275, "grad_norm": 0.4775138500042285, "learning_rate": 4.7366632112132325e-06, "loss": 0.06, "step": 10029 }, { "epoch": 0.7808865748625371, "grad_norm": 0.6038066204268899, "learning_rate": 4.733437974616728e-06, "loss": 0.0755, "step": 10030 }, { "epoch": 0.7809644299547467, "grad_norm": 0.4209794013549072, "learning_rate": 4.730213689055278e-06, "loss": 0.0319, "step": 10031 }, { "epoch": 0.7810422850469564, "grad_norm": 0.4838183340579001, "learning_rate": 4.726990354729744e-06, "loss": 0.0496, "step": 10032 }, { "epoch": 0.781120140139166, "grad_norm": 0.4936626763144829, "learning_rate": 4.723767971840918e-06, "loss": 0.0524, "step": 10033 }, { "epoch": 0.7811979952313756, "grad_norm": 0.3519512158431991, "learning_rate": 4.720546540589536e-06, "loss": 0.0344, "step": 10034 }, { "epoch": 0.7812758503235853, "grad_norm": 0.504217087577108, "learning_rate": 4.717326061176295e-06, "loss": 0.0409, "step": 10035 }, { "epoch": 0.7813537054157949, "grad_norm": 0.48524970974165993, "learning_rate": 4.7141065338018075e-06, "loss": 0.043, "step": 10036 }, { "epoch": 0.7814315605080044, "grad_norm": 0.4493259143209198, "learning_rate": 4.7108879586666274e-06, "loss": 0.0438, "step": 10037 }, { "epoch": 0.7815094156002141, "grad_norm": 0.45869400947128725, "learning_rate": 4.707670335971264e-06, "loss": 0.0363, "step": 10038 }, { "epoch": 0.7815872706924237, "grad_norm": 0.44288966232740823, "learning_rate": 4.704453665916153e-06, "loss": 0.0413, "step": 10039 }, { "epoch": 0.7816651257846333, "grad_norm": 0.44107218474849086, "learning_rate": 4.701237948701687e-06, "loss": 0.0436, "step": 10040 }, { "epoch": 0.781742980876843, "grad_norm": 0.3905005522994527, "learning_rate": 4.698023184528187e-06, "loss": 0.0285, "step": 10041 }, { "epoch": 0.7818208359690526, "grad_norm": 0.4669477811827144, "learning_rate": 4.694809373595917e-06, "loss": 0.0444, "step": 10042 }, { "epoch": 0.7818986910612622, "grad_norm": 0.4812929998041583, "learning_rate": 4.6915965161050835e-06, "loss": 0.0455, "step": 10043 }, { "epoch": 0.7819765461534719, "grad_norm": 0.5565221424950751, "learning_rate": 4.688384612255823e-06, "loss": 0.0609, "step": 10044 }, { "epoch": 0.7820544012456815, "grad_norm": 0.565914152276299, "learning_rate": 4.685173662248243e-06, "loss": 0.0592, "step": 10045 }, { "epoch": 0.7821322563378911, "grad_norm": 0.5013207199682161, "learning_rate": 4.681963666282352e-06, "loss": 0.0489, "step": 10046 }, { "epoch": 0.7822101114301008, "grad_norm": 0.592150560219642, "learning_rate": 4.678754624558124e-06, "loss": 0.0707, "step": 10047 }, { "epoch": 0.7822879665223104, "grad_norm": 0.4946741473034796, "learning_rate": 4.675546537275466e-06, "loss": 0.0577, "step": 10048 }, { "epoch": 0.78236582161452, "grad_norm": 0.39621552428639517, "learning_rate": 4.6723394046342205e-06, "loss": 0.0384, "step": 10049 }, { "epoch": 0.7824436767067297, "grad_norm": 0.5403527603118834, "learning_rate": 4.669133226834193e-06, "loss": 0.0629, "step": 10050 }, { "epoch": 0.7824436767067297, "eval_loss": 0.006244319956749678, "eval_runtime": 162.6144, "eval_samples_per_second": 17.711, "eval_steps_per_second": 0.633, "step": 10050 }, { "epoch": 0.7825215317989392, "grad_norm": 0.5825898983390326, "learning_rate": 4.6659280040751e-06, "loss": 0.0653, "step": 10051 }, { "epoch": 0.7825993868911488, "grad_norm": 0.48403109664776417, "learning_rate": 4.662723736556618e-06, "loss": 0.0635, "step": 10052 }, { "epoch": 0.7826772419833585, "grad_norm": 0.4633526272938232, "learning_rate": 4.659520424478356e-06, "loss": 0.0538, "step": 10053 }, { "epoch": 0.7827550970755681, "grad_norm": 0.4753613293304323, "learning_rate": 4.6563180680398644e-06, "loss": 0.0401, "step": 10054 }, { "epoch": 0.7828329521677777, "grad_norm": 0.458098341903255, "learning_rate": 4.653116667440636e-06, "loss": 0.0515, "step": 10055 }, { "epoch": 0.7829108072599873, "grad_norm": 0.5818711768020278, "learning_rate": 4.649916222880102e-06, "loss": 0.0761, "step": 10056 }, { "epoch": 0.782988662352197, "grad_norm": 0.31204348811873184, "learning_rate": 4.646716734557635e-06, "loss": 0.0215, "step": 10057 }, { "epoch": 0.7830665174444066, "grad_norm": 0.3812418699205883, "learning_rate": 4.643518202672548e-06, "loss": 0.0255, "step": 10058 }, { "epoch": 0.7831443725366162, "grad_norm": 0.4029870840756976, "learning_rate": 4.640320627424097e-06, "loss": 0.0419, "step": 10059 }, { "epoch": 0.7832222276288259, "grad_norm": 0.5446657755740705, "learning_rate": 4.6371240090114665e-06, "loss": 0.0571, "step": 10060 }, { "epoch": 0.7833000827210355, "grad_norm": 0.4902941588434069, "learning_rate": 4.6339283476338045e-06, "loss": 0.0462, "step": 10061 }, { "epoch": 0.783377937813245, "grad_norm": 0.4363936066991916, "learning_rate": 4.630733643490182e-06, "loss": 0.0371, "step": 10062 }, { "epoch": 0.7834557929054548, "grad_norm": 0.43608695084390053, "learning_rate": 4.627539896779609e-06, "loss": 0.0502, "step": 10063 }, { "epoch": 0.7835336479976643, "grad_norm": 0.4587986746253261, "learning_rate": 4.624347107701044e-06, "loss": 0.0353, "step": 10064 }, { "epoch": 0.7836115030898739, "grad_norm": 0.3735711566294658, "learning_rate": 4.621155276453382e-06, "loss": 0.0289, "step": 10065 }, { "epoch": 0.7836893581820836, "grad_norm": 0.4646902192916106, "learning_rate": 4.61796440323546e-06, "loss": 0.0377, "step": 10066 }, { "epoch": 0.7837672132742932, "grad_norm": 0.4991306588079822, "learning_rate": 4.614774488246055e-06, "loss": 0.0722, "step": 10067 }, { "epoch": 0.7838450683665028, "grad_norm": 0.47668256988039137, "learning_rate": 4.61158553168388e-06, "loss": 0.0462, "step": 10068 }, { "epoch": 0.7839229234587125, "grad_norm": 0.48896766908893324, "learning_rate": 4.608397533747595e-06, "loss": 0.0484, "step": 10069 }, { "epoch": 0.7840007785509221, "grad_norm": 0.47637206197009285, "learning_rate": 4.60521049463579e-06, "loss": 0.0401, "step": 10070 }, { "epoch": 0.7840786336431317, "grad_norm": 0.49582451431148383, "learning_rate": 4.602024414547017e-06, "loss": 0.0497, "step": 10071 }, { "epoch": 0.7841564887353414, "grad_norm": 0.40212651330209515, "learning_rate": 4.598839293679744e-06, "loss": 0.0341, "step": 10072 }, { "epoch": 0.784234343827551, "grad_norm": 0.4389714071058949, "learning_rate": 4.595655132232395e-06, "loss": 0.0412, "step": 10073 }, { "epoch": 0.7843121989197606, "grad_norm": 0.5262422549212964, "learning_rate": 4.592471930403322e-06, "loss": 0.0668, "step": 10074 }, { "epoch": 0.7843900540119703, "grad_norm": 0.3824695241224627, "learning_rate": 4.589289688390825e-06, "loss": 0.0359, "step": 10075 }, { "epoch": 0.7844679091041799, "grad_norm": 0.43033259001313, "learning_rate": 4.586108406393144e-06, "loss": 0.0441, "step": 10076 }, { "epoch": 0.7845457641963894, "grad_norm": 0.6270385898059418, "learning_rate": 4.582928084608458e-06, "loss": 0.073, "step": 10077 }, { "epoch": 0.7846236192885991, "grad_norm": 0.36461138228434603, "learning_rate": 4.579748723234887e-06, "loss": 0.0286, "step": 10078 }, { "epoch": 0.7847014743808087, "grad_norm": 0.3928207890616527, "learning_rate": 4.576570322470488e-06, "loss": 0.0369, "step": 10079 }, { "epoch": 0.7847793294730183, "grad_norm": 0.4481434641055158, "learning_rate": 4.573392882513255e-06, "loss": 0.0422, "step": 10080 }, { "epoch": 0.784857184565228, "grad_norm": 0.4552174914604178, "learning_rate": 4.570216403561141e-06, "loss": 0.0296, "step": 10081 }, { "epoch": 0.7849350396574376, "grad_norm": 0.4236262744967315, "learning_rate": 4.567040885812017e-06, "loss": 0.0382, "step": 10082 }, { "epoch": 0.7850128947496472, "grad_norm": 0.5470963764556572, "learning_rate": 4.563866329463706e-06, "loss": 0.0559, "step": 10083 }, { "epoch": 0.7850907498418569, "grad_norm": 0.5415460184732969, "learning_rate": 4.560692734713965e-06, "loss": 0.0585, "step": 10084 }, { "epoch": 0.7851686049340665, "grad_norm": 0.3801112252397395, "learning_rate": 4.5575201017604975e-06, "loss": 0.0344, "step": 10085 }, { "epoch": 0.7852464600262761, "grad_norm": 0.32377928866423505, "learning_rate": 4.5543484308009386e-06, "loss": 0.0271, "step": 10086 }, { "epoch": 0.7853243151184858, "grad_norm": 0.5040657196980016, "learning_rate": 4.551177722032873e-06, "loss": 0.0528, "step": 10087 }, { "epoch": 0.7854021702106954, "grad_norm": 0.4311549709714528, "learning_rate": 4.548007975653821e-06, "loss": 0.0416, "step": 10088 }, { "epoch": 0.785480025302905, "grad_norm": 0.5117585712790355, "learning_rate": 4.544839191861241e-06, "loss": 0.0542, "step": 10089 }, { "epoch": 0.7855578803951145, "grad_norm": 0.44363204118997207, "learning_rate": 4.541671370852525e-06, "loss": 0.0347, "step": 10090 }, { "epoch": 0.7856357354873242, "grad_norm": 0.3969273312134276, "learning_rate": 4.5385045128250305e-06, "loss": 0.0347, "step": 10091 }, { "epoch": 0.7857135905795338, "grad_norm": 0.4301534832343127, "learning_rate": 4.535338617976029e-06, "loss": 0.0365, "step": 10092 }, { "epoch": 0.7857914456717434, "grad_norm": 0.43175699057055883, "learning_rate": 4.532173686502741e-06, "loss": 0.0361, "step": 10093 }, { "epoch": 0.7858693007639531, "grad_norm": 0.49769357394460195, "learning_rate": 4.529009718602333e-06, "loss": 0.061, "step": 10094 }, { "epoch": 0.7859471558561627, "grad_norm": 0.5123957869147352, "learning_rate": 4.525846714471893e-06, "loss": 0.0534, "step": 10095 }, { "epoch": 0.7860250109483723, "grad_norm": 0.4970502949772666, "learning_rate": 4.5226846743084615e-06, "loss": 0.0447, "step": 10096 }, { "epoch": 0.786102866040582, "grad_norm": 0.350083385428647, "learning_rate": 4.519523598309032e-06, "loss": 0.0296, "step": 10097 }, { "epoch": 0.7861807211327916, "grad_norm": 0.3508074026132466, "learning_rate": 4.5163634866705185e-06, "loss": 0.0261, "step": 10098 }, { "epoch": 0.7862585762250012, "grad_norm": 0.485085663819924, "learning_rate": 4.513204339589778e-06, "loss": 0.0455, "step": 10099 }, { "epoch": 0.7863364313172109, "grad_norm": 0.44158709531632084, "learning_rate": 4.510046157263615e-06, "loss": 0.0548, "step": 10100 }, { "epoch": 0.7863364313172109, "eval_loss": 0.006196147762238979, "eval_runtime": 162.063, "eval_samples_per_second": 17.771, "eval_steps_per_second": 0.636, "step": 10100 }, { "epoch": 0.7864142864094205, "grad_norm": 0.42390078196857833, "learning_rate": 4.50688893988876e-06, "loss": 0.0365, "step": 10101 }, { "epoch": 0.78649214150163, "grad_norm": 0.43827659673678443, "learning_rate": 4.503732687661906e-06, "loss": 0.0402, "step": 10102 }, { "epoch": 0.7865699965938397, "grad_norm": 0.36730999202634884, "learning_rate": 4.500577400779673e-06, "loss": 0.0322, "step": 10103 }, { "epoch": 0.7866478516860493, "grad_norm": 0.4162710664119154, "learning_rate": 4.497423079438608e-06, "loss": 0.035, "step": 10104 }, { "epoch": 0.7867257067782589, "grad_norm": 0.29607511219787247, "learning_rate": 4.494269723835216e-06, "loss": 0.0195, "step": 10105 }, { "epoch": 0.7868035618704686, "grad_norm": 0.36144468815514114, "learning_rate": 4.491117334165933e-06, "loss": 0.0298, "step": 10106 }, { "epoch": 0.7868814169626782, "grad_norm": 0.502729500743898, "learning_rate": 4.487965910627146e-06, "loss": 0.0543, "step": 10107 }, { "epoch": 0.7869592720548878, "grad_norm": 0.5357583709613811, "learning_rate": 4.48481545341517e-06, "loss": 0.071, "step": 10108 }, { "epoch": 0.7870371271470975, "grad_norm": 0.43592793503843924, "learning_rate": 4.481665962726263e-06, "loss": 0.0364, "step": 10109 }, { "epoch": 0.7871149822393071, "grad_norm": 0.43316090383940004, "learning_rate": 4.478517438756624e-06, "loss": 0.0389, "step": 10110 }, { "epoch": 0.7871928373315167, "grad_norm": 0.5388014157337224, "learning_rate": 4.4753698817023875e-06, "loss": 0.0549, "step": 10111 }, { "epoch": 0.7872706924237264, "grad_norm": 0.5082862276041233, "learning_rate": 4.4722232917596454e-06, "loss": 0.0629, "step": 10112 }, { "epoch": 0.787348547515936, "grad_norm": 0.4362135759383822, "learning_rate": 4.469077669124397e-06, "loss": 0.0504, "step": 10113 }, { "epoch": 0.7874264026081456, "grad_norm": 0.46758987190413587, "learning_rate": 4.465933013992612e-06, "loss": 0.0496, "step": 10114 }, { "epoch": 0.7875042577003553, "grad_norm": 0.4963713652071265, "learning_rate": 4.462789326560182e-06, "loss": 0.0541, "step": 10115 }, { "epoch": 0.7875821127925648, "grad_norm": 0.4986564473850128, "learning_rate": 4.459646607022938e-06, "loss": 0.0522, "step": 10116 }, { "epoch": 0.7876599678847744, "grad_norm": 0.40919850670930863, "learning_rate": 4.4565048555766735e-06, "loss": 0.037, "step": 10117 }, { "epoch": 0.7877378229769841, "grad_norm": 0.46119070935791096, "learning_rate": 4.453364072417092e-06, "loss": 0.0444, "step": 10118 }, { "epoch": 0.7878156780691937, "grad_norm": 0.5659518240755159, "learning_rate": 4.450224257739855e-06, "loss": 0.0676, "step": 10119 }, { "epoch": 0.7878935331614033, "grad_norm": 0.4019172670401179, "learning_rate": 4.447085411740557e-06, "loss": 0.0337, "step": 10120 }, { "epoch": 0.787971388253613, "grad_norm": 0.3463811267993297, "learning_rate": 4.443947534614732e-06, "loss": 0.0278, "step": 10121 }, { "epoch": 0.7880492433458226, "grad_norm": 0.43062147020719677, "learning_rate": 4.440810626557857e-06, "loss": 0.0437, "step": 10122 }, { "epoch": 0.7881270984380322, "grad_norm": 0.48691904582516665, "learning_rate": 4.4376746877653435e-06, "loss": 0.0427, "step": 10123 }, { "epoch": 0.7882049535302419, "grad_norm": 0.4683576043931745, "learning_rate": 4.434539718432547e-06, "loss": 0.0519, "step": 10124 }, { "epoch": 0.7882828086224515, "grad_norm": 0.4403738663307614, "learning_rate": 4.4314057187547625e-06, "loss": 0.0476, "step": 10125 }, { "epoch": 0.7883606637146611, "grad_norm": 0.4549911943934089, "learning_rate": 4.428272688927218e-06, "loss": 0.0467, "step": 10126 }, { "epoch": 0.7884385188068707, "grad_norm": 0.4622172274251161, "learning_rate": 4.425140629145097e-06, "loss": 0.0476, "step": 10127 }, { "epoch": 0.7885163738990804, "grad_norm": 0.43931662187572135, "learning_rate": 4.422009539603507e-06, "loss": 0.0449, "step": 10128 }, { "epoch": 0.78859422899129, "grad_norm": 0.4821976640505294, "learning_rate": 4.418879420497497e-06, "loss": 0.0593, "step": 10129 }, { "epoch": 0.7886720840834995, "grad_norm": 0.4871246428159997, "learning_rate": 4.415750272022064e-06, "loss": 0.0553, "step": 10130 }, { "epoch": 0.7887499391757092, "grad_norm": 0.49281102234087304, "learning_rate": 4.412622094372136e-06, "loss": 0.0524, "step": 10131 }, { "epoch": 0.7888277942679188, "grad_norm": 0.5185162455030492, "learning_rate": 4.409494887742585e-06, "loss": 0.0503, "step": 10132 }, { "epoch": 0.7889056493601284, "grad_norm": 0.39167114099100026, "learning_rate": 4.4063686523282186e-06, "loss": 0.0335, "step": 10133 }, { "epoch": 0.7889835044523381, "grad_norm": 0.5354584451959443, "learning_rate": 4.40324338832379e-06, "loss": 0.0501, "step": 10134 }, { "epoch": 0.7890613595445477, "grad_norm": 0.4991286929818512, "learning_rate": 4.400119095923987e-06, "loss": 0.0616, "step": 10135 }, { "epoch": 0.7891392146367573, "grad_norm": 0.4806222134745559, "learning_rate": 4.396995775323438e-06, "loss": 0.0478, "step": 10136 }, { "epoch": 0.789217069728967, "grad_norm": 0.46102757599783883, "learning_rate": 4.393873426716708e-06, "loss": 0.0414, "step": 10137 }, { "epoch": 0.7892949248211766, "grad_norm": 0.4012728954889305, "learning_rate": 4.390752050298312e-06, "loss": 0.0376, "step": 10138 }, { "epoch": 0.7893727799133862, "grad_norm": 0.4613955651459853, "learning_rate": 4.387631646262694e-06, "loss": 0.0346, "step": 10139 }, { "epoch": 0.7894506350055959, "grad_norm": 0.46179803765380373, "learning_rate": 4.38451221480424e-06, "loss": 0.0462, "step": 10140 }, { "epoch": 0.7895284900978055, "grad_norm": 0.4134058849068417, "learning_rate": 4.381393756117273e-06, "loss": 0.0488, "step": 10141 }, { "epoch": 0.789606345190015, "grad_norm": 0.467349966840717, "learning_rate": 4.378276270396065e-06, "loss": 0.0368, "step": 10142 }, { "epoch": 0.7896842002822247, "grad_norm": 0.46047252635432784, "learning_rate": 4.375159757834815e-06, "loss": 0.0464, "step": 10143 }, { "epoch": 0.7897620553744343, "grad_norm": 0.41323669816115793, "learning_rate": 4.3720442186276666e-06, "loss": 0.039, "step": 10144 }, { "epoch": 0.7898399104666439, "grad_norm": 0.4251260206215758, "learning_rate": 4.368929652968708e-06, "loss": 0.0385, "step": 10145 }, { "epoch": 0.7899177655588536, "grad_norm": 0.6268548171103736, "learning_rate": 4.36581606105196e-06, "loss": 0.0795, "step": 10146 }, { "epoch": 0.7899956206510632, "grad_norm": 0.38268118982673716, "learning_rate": 4.3627034430713766e-06, "loss": 0.0318, "step": 10147 }, { "epoch": 0.7900734757432728, "grad_norm": 0.4207031949206848, "learning_rate": 4.359591799220875e-06, "loss": 0.028, "step": 10148 }, { "epoch": 0.7901513308354825, "grad_norm": 0.5764127640875216, "learning_rate": 4.356481129694285e-06, "loss": 0.061, "step": 10149 }, { "epoch": 0.7902291859276921, "grad_norm": 0.46158751823960115, "learning_rate": 4.353371434685392e-06, "loss": 0.0472, "step": 10150 }, { "epoch": 0.7902291859276921, "eval_loss": 0.0061314948834478855, "eval_runtime": 162.5519, "eval_samples_per_second": 17.717, "eval_steps_per_second": 0.634, "step": 10150 }, { "epoch": 0.7903070410199017, "grad_norm": 0.4166737719109973, "learning_rate": 4.350262714387918e-06, "loss": 0.0333, "step": 10151 }, { "epoch": 0.7903848961121114, "grad_norm": 0.6340248773034339, "learning_rate": 4.347154968995506e-06, "loss": 0.0739, "step": 10152 }, { "epoch": 0.790462751204321, "grad_norm": 0.4787742256929467, "learning_rate": 4.344048198701769e-06, "loss": 0.0459, "step": 10153 }, { "epoch": 0.7905406062965306, "grad_norm": 0.47457541710178697, "learning_rate": 4.340942403700241e-06, "loss": 0.044, "step": 10154 }, { "epoch": 0.7906184613887403, "grad_norm": 0.4761575959512971, "learning_rate": 4.337837584184394e-06, "loss": 0.0534, "step": 10155 }, { "epoch": 0.7906963164809498, "grad_norm": 0.3610471945311035, "learning_rate": 4.33473374034765e-06, "loss": 0.0287, "step": 10156 }, { "epoch": 0.7907741715731594, "grad_norm": 0.5099127332807171, "learning_rate": 4.331630872383355e-06, "loss": 0.0675, "step": 10157 }, { "epoch": 0.7908520266653691, "grad_norm": 0.5706906099226092, "learning_rate": 4.328528980484814e-06, "loss": 0.0776, "step": 10158 }, { "epoch": 0.7909298817575787, "grad_norm": 0.5114489408918707, "learning_rate": 4.325428064845254e-06, "loss": 0.0652, "step": 10159 }, { "epoch": 0.7910077368497883, "grad_norm": 0.423485595157095, "learning_rate": 4.322328125657851e-06, "loss": 0.0411, "step": 10160 }, { "epoch": 0.7910855919419979, "grad_norm": 0.39227994932584037, "learning_rate": 4.319229163115717e-06, "loss": 0.0427, "step": 10161 }, { "epoch": 0.7911634470342076, "grad_norm": 0.39933187402352965, "learning_rate": 4.316131177411891e-06, "loss": 0.035, "step": 10162 }, { "epoch": 0.7912413021264172, "grad_norm": 0.5258439308482786, "learning_rate": 4.313034168739376e-06, "loss": 0.0557, "step": 10163 }, { "epoch": 0.7913191572186268, "grad_norm": 0.3040417811123004, "learning_rate": 4.309938137291096e-06, "loss": 0.0141, "step": 10164 }, { "epoch": 0.7913970123108365, "grad_norm": 0.4107502760350605, "learning_rate": 4.306843083259922e-06, "loss": 0.0356, "step": 10165 }, { "epoch": 0.7914748674030461, "grad_norm": 0.41055310830341474, "learning_rate": 4.30374900683866e-06, "loss": 0.0376, "step": 10166 }, { "epoch": 0.7915527224952557, "grad_norm": 0.5118438498833568, "learning_rate": 4.300655908220051e-06, "loss": 0.0594, "step": 10167 }, { "epoch": 0.7916305775874654, "grad_norm": 0.5287290587279326, "learning_rate": 4.297563787596788e-06, "loss": 0.0524, "step": 10168 }, { "epoch": 0.791708432679675, "grad_norm": 0.39080864689712547, "learning_rate": 4.294472645161496e-06, "loss": 0.0296, "step": 10169 }, { "epoch": 0.7917862877718845, "grad_norm": 0.4347607940212662, "learning_rate": 4.291382481106741e-06, "loss": 0.0469, "step": 10170 }, { "epoch": 0.7918641428640942, "grad_norm": 0.49846882211915233, "learning_rate": 4.288293295625012e-06, "loss": 0.0454, "step": 10171 }, { "epoch": 0.7919419979563038, "grad_norm": 0.4441134795175423, "learning_rate": 4.285205088908761e-06, "loss": 0.0389, "step": 10172 }, { "epoch": 0.7920198530485134, "grad_norm": 0.4910939118049496, "learning_rate": 4.282117861150363e-06, "loss": 0.0513, "step": 10173 }, { "epoch": 0.7920977081407231, "grad_norm": 0.4547601441147331, "learning_rate": 4.279031612542146e-06, "loss": 0.0505, "step": 10174 }, { "epoch": 0.7921755632329327, "grad_norm": 0.47522046423130737, "learning_rate": 4.275946343276362e-06, "loss": 0.0584, "step": 10175 }, { "epoch": 0.7922534183251423, "grad_norm": 0.44783105751983393, "learning_rate": 4.2728620535452144e-06, "loss": 0.0403, "step": 10176 }, { "epoch": 0.792331273417352, "grad_norm": 0.5828573345735765, "learning_rate": 4.2697787435408355e-06, "loss": 0.0617, "step": 10177 }, { "epoch": 0.7924091285095616, "grad_norm": 0.47630890409480364, "learning_rate": 4.266696413455298e-06, "loss": 0.0506, "step": 10178 }, { "epoch": 0.7924869836017712, "grad_norm": 0.41541221022422314, "learning_rate": 4.2636150634806305e-06, "loss": 0.0352, "step": 10179 }, { "epoch": 0.7925648386939809, "grad_norm": 0.4379522663538946, "learning_rate": 4.260534693808771e-06, "loss": 0.0331, "step": 10180 }, { "epoch": 0.7926426937861905, "grad_norm": 0.48739682177414906, "learning_rate": 4.257455304631619e-06, "loss": 0.0379, "step": 10181 }, { "epoch": 0.7927205488784, "grad_norm": 0.4707408913639249, "learning_rate": 4.254376896141006e-06, "loss": 0.0485, "step": 10182 }, { "epoch": 0.7927984039706097, "grad_norm": 0.4865253151324237, "learning_rate": 4.2512994685286956e-06, "loss": 0.0433, "step": 10183 }, { "epoch": 0.7928762590628193, "grad_norm": 0.4651679136139792, "learning_rate": 4.248223021986406e-06, "loss": 0.0443, "step": 10184 }, { "epoch": 0.7929541141550289, "grad_norm": 0.40894612841985084, "learning_rate": 4.245147556705785e-06, "loss": 0.0414, "step": 10185 }, { "epoch": 0.7930319692472386, "grad_norm": 0.4350236974119263, "learning_rate": 4.2420730728784165e-06, "loss": 0.0388, "step": 10186 }, { "epoch": 0.7931098243394482, "grad_norm": 0.4327645753501777, "learning_rate": 4.238999570695825e-06, "loss": 0.0376, "step": 10187 }, { "epoch": 0.7931876794316578, "grad_norm": 0.47381639393141195, "learning_rate": 4.235927050349477e-06, "loss": 0.0503, "step": 10188 }, { "epoch": 0.7932655345238675, "grad_norm": 0.473832687997573, "learning_rate": 4.232855512030778e-06, "loss": 0.0346, "step": 10189 }, { "epoch": 0.7933433896160771, "grad_norm": 0.4425502447227333, "learning_rate": 4.229784955931069e-06, "loss": 0.0429, "step": 10190 }, { "epoch": 0.7934212447082867, "grad_norm": 0.4661101124776751, "learning_rate": 4.226715382241629e-06, "loss": 0.0429, "step": 10191 }, { "epoch": 0.7934990998004964, "grad_norm": 0.5421893328885683, "learning_rate": 4.223646791153679e-06, "loss": 0.0564, "step": 10192 }, { "epoch": 0.793576954892706, "grad_norm": 0.36388471992303023, "learning_rate": 4.2205791828583755e-06, "loss": 0.0269, "step": 10193 }, { "epoch": 0.7936548099849156, "grad_norm": 0.41707360290165574, "learning_rate": 4.217512557546824e-06, "loss": 0.0344, "step": 10194 }, { "epoch": 0.7937326650771253, "grad_norm": 0.6056784927092924, "learning_rate": 4.214446915410053e-06, "loss": 0.0755, "step": 10195 }, { "epoch": 0.7938105201693348, "grad_norm": 0.6821305397529484, "learning_rate": 4.211382256639042e-06, "loss": 0.0814, "step": 10196 }, { "epoch": 0.7938883752615444, "grad_norm": 0.4778870960855294, "learning_rate": 4.208318581424704e-06, "loss": 0.0524, "step": 10197 }, { "epoch": 0.793966230353754, "grad_norm": 0.4530183513823952, "learning_rate": 4.205255889957891e-06, "loss": 0.0329, "step": 10198 }, { "epoch": 0.7940440854459637, "grad_norm": 0.4705896576956278, "learning_rate": 4.202194182429395e-06, "loss": 0.0424, "step": 10199 }, { "epoch": 0.7941219405381733, "grad_norm": 0.4140033836544804, "learning_rate": 4.199133459029945e-06, "loss": 0.0367, "step": 10200 }, { "epoch": 0.7941219405381733, "eval_loss": 0.0061164298094809055, "eval_runtime": 162.3206, "eval_samples_per_second": 17.743, "eval_steps_per_second": 0.635, "step": 10200 }, { "epoch": 0.7941997956303829, "grad_norm": 0.33782132518742125, "learning_rate": 4.196073719950209e-06, "loss": 0.0235, "step": 10201 }, { "epoch": 0.7942776507225926, "grad_norm": 0.49327696154734363, "learning_rate": 4.193014965380795e-06, "loss": 0.0482, "step": 10202 }, { "epoch": 0.7943555058148022, "grad_norm": 0.4125257811642058, "learning_rate": 4.189957195512244e-06, "loss": 0.0384, "step": 10203 }, { "epoch": 0.7944333609070118, "grad_norm": 0.44349172043402185, "learning_rate": 4.1869004105350505e-06, "loss": 0.0453, "step": 10204 }, { "epoch": 0.7945112159992215, "grad_norm": 0.3742563521604884, "learning_rate": 4.1838446106396356e-06, "loss": 0.0253, "step": 10205 }, { "epoch": 0.7945890710914311, "grad_norm": 0.5136016737442972, "learning_rate": 4.180789796016358e-06, "loss": 0.0426, "step": 10206 }, { "epoch": 0.7946669261836407, "grad_norm": 0.43276386692924806, "learning_rate": 4.177735966855519e-06, "loss": 0.0339, "step": 10207 }, { "epoch": 0.7947447812758504, "grad_norm": 0.5274145899384142, "learning_rate": 4.174683123347358e-06, "loss": 0.0684, "step": 10208 }, { "epoch": 0.7948226363680599, "grad_norm": 0.43073296497065294, "learning_rate": 4.171631265682052e-06, "loss": 0.0317, "step": 10209 }, { "epoch": 0.7949004914602695, "grad_norm": 0.46349806623641376, "learning_rate": 4.168580394049719e-06, "loss": 0.0375, "step": 10210 }, { "epoch": 0.7949783465524792, "grad_norm": 0.4371912651182651, "learning_rate": 4.165530508640414e-06, "loss": 0.0385, "step": 10211 }, { "epoch": 0.7950562016446888, "grad_norm": 0.5455706003849718, "learning_rate": 4.162481609644129e-06, "loss": 0.0549, "step": 10212 }, { "epoch": 0.7951340567368984, "grad_norm": 0.45976358172535986, "learning_rate": 4.159433697250796e-06, "loss": 0.0497, "step": 10213 }, { "epoch": 0.7952119118291081, "grad_norm": 0.5782516791452651, "learning_rate": 4.1563867716502825e-06, "loss": 0.0612, "step": 10214 }, { "epoch": 0.7952897669213177, "grad_norm": 0.44124851665101394, "learning_rate": 4.1533408330324065e-06, "loss": 0.0464, "step": 10215 }, { "epoch": 0.7953676220135273, "grad_norm": 0.39459510903995, "learning_rate": 4.15029588158691e-06, "loss": 0.0263, "step": 10216 }, { "epoch": 0.795445477105737, "grad_norm": 0.5029799479777364, "learning_rate": 4.147251917503481e-06, "loss": 0.0621, "step": 10217 }, { "epoch": 0.7955233321979466, "grad_norm": 0.5012652113125813, "learning_rate": 4.144208940971748e-06, "loss": 0.0478, "step": 10218 }, { "epoch": 0.7956011872901562, "grad_norm": 0.38094963762451256, "learning_rate": 4.141166952181257e-06, "loss": 0.0249, "step": 10219 }, { "epoch": 0.7956790423823659, "grad_norm": 0.48923511433800354, "learning_rate": 4.13812595132153e-06, "loss": 0.0546, "step": 10220 }, { "epoch": 0.7957568974745755, "grad_norm": 0.5117341263437525, "learning_rate": 4.135085938581997e-06, "loss": 0.0478, "step": 10221 }, { "epoch": 0.795834752566785, "grad_norm": 0.5176115077282205, "learning_rate": 4.132046914152039e-06, "loss": 0.0494, "step": 10222 }, { "epoch": 0.7959126076589947, "grad_norm": 0.5136324698633243, "learning_rate": 4.1290088782209706e-06, "loss": 0.0599, "step": 10223 }, { "epoch": 0.7959904627512043, "grad_norm": 0.5767965095736453, "learning_rate": 4.125971830978046e-06, "loss": 0.0584, "step": 10224 }, { "epoch": 0.7960683178434139, "grad_norm": 0.4109653383202394, "learning_rate": 4.122935772612466e-06, "loss": 0.0324, "step": 10225 }, { "epoch": 0.7961461729356236, "grad_norm": 0.410839783639755, "learning_rate": 4.119900703313358e-06, "loss": 0.0312, "step": 10226 }, { "epoch": 0.7962240280278332, "grad_norm": 0.36377682870875333, "learning_rate": 4.1168666232697996e-06, "loss": 0.0242, "step": 10227 }, { "epoch": 0.7963018831200428, "grad_norm": 0.4547019309102538, "learning_rate": 4.113833532670785e-06, "loss": 0.0427, "step": 10228 }, { "epoch": 0.7963797382122525, "grad_norm": 0.5781491789057727, "learning_rate": 4.110801431705269e-06, "loss": 0.0703, "step": 10229 }, { "epoch": 0.7964575933044621, "grad_norm": 0.46274409958241147, "learning_rate": 4.107770320562141e-06, "loss": 0.0414, "step": 10230 }, { "epoch": 0.7965354483966717, "grad_norm": 0.34698629145227355, "learning_rate": 4.104740199430221e-06, "loss": 0.0213, "step": 10231 }, { "epoch": 0.7966133034888814, "grad_norm": 0.4955718662927833, "learning_rate": 4.101711068498273e-06, "loss": 0.0459, "step": 10232 }, { "epoch": 0.796691158581091, "grad_norm": 0.48109531198431865, "learning_rate": 4.098682927954998e-06, "loss": 0.0443, "step": 10233 }, { "epoch": 0.7967690136733006, "grad_norm": 0.5254690637022533, "learning_rate": 4.0956557779890274e-06, "loss": 0.0577, "step": 10234 }, { "epoch": 0.7968468687655101, "grad_norm": 0.5256284144971595, "learning_rate": 4.09262961878895e-06, "loss": 0.0542, "step": 10235 }, { "epoch": 0.7969247238577198, "grad_norm": 0.5113695499535383, "learning_rate": 4.089604450543281e-06, "loss": 0.0667, "step": 10236 }, { "epoch": 0.7970025789499294, "grad_norm": 0.4606925653123207, "learning_rate": 4.0865802734404655e-06, "loss": 0.0386, "step": 10237 }, { "epoch": 0.797080434042139, "grad_norm": 0.43835745448864616, "learning_rate": 4.083557087668899e-06, "loss": 0.0432, "step": 10238 }, { "epoch": 0.7971582891343487, "grad_norm": 0.45777045771519137, "learning_rate": 4.080534893416905e-06, "loss": 0.045, "step": 10239 }, { "epoch": 0.7972361442265583, "grad_norm": 0.39479702664393096, "learning_rate": 4.077513690872766e-06, "loss": 0.0354, "step": 10240 }, { "epoch": 0.7973139993187679, "grad_norm": 0.44452164944163525, "learning_rate": 4.074493480224682e-06, "loss": 0.0378, "step": 10241 }, { "epoch": 0.7973918544109776, "grad_norm": 0.3384589393878127, "learning_rate": 4.071474261660797e-06, "loss": 0.0187, "step": 10242 }, { "epoch": 0.7974697095031872, "grad_norm": 0.44668408903006906, "learning_rate": 4.0684560353691945e-06, "loss": 0.0578, "step": 10243 }, { "epoch": 0.7975475645953968, "grad_norm": 0.5346986156111201, "learning_rate": 4.065438801537891e-06, "loss": 0.0739, "step": 10244 }, { "epoch": 0.7976254196876065, "grad_norm": 0.4497586961264135, "learning_rate": 4.062422560354862e-06, "loss": 0.0451, "step": 10245 }, { "epoch": 0.7977032747798161, "grad_norm": 0.48885488164932295, "learning_rate": 4.059407312007988e-06, "loss": 0.0556, "step": 10246 }, { "epoch": 0.7977811298720257, "grad_norm": 0.420710667115474, "learning_rate": 4.0563930566851125e-06, "loss": 0.0352, "step": 10247 }, { "epoch": 0.7978589849642354, "grad_norm": 0.4175514032052673, "learning_rate": 4.053379794574008e-06, "loss": 0.0375, "step": 10248 }, { "epoch": 0.7979368400564449, "grad_norm": 0.5606526191393646, "learning_rate": 4.050367525862386e-06, "loss": 0.0701, "step": 10249 }, { "epoch": 0.7980146951486545, "grad_norm": 0.48216778671891075, "learning_rate": 4.04735625073789e-06, "loss": 0.0488, "step": 10250 }, { "epoch": 0.7980146951486545, "eval_loss": 0.00602651247754693, "eval_runtime": 161.9455, "eval_samples_per_second": 17.784, "eval_steps_per_second": 0.636, "step": 10250 }, { "epoch": 0.7980925502408642, "grad_norm": 0.4105879935361619, "learning_rate": 4.044345969388124e-06, "loss": 0.0404, "step": 10251 }, { "epoch": 0.7981704053330738, "grad_norm": 0.4519494134579234, "learning_rate": 4.041336682000605e-06, "loss": 0.0459, "step": 10252 }, { "epoch": 0.7982482604252834, "grad_norm": 0.47120874061885315, "learning_rate": 4.038328388762798e-06, "loss": 0.0513, "step": 10253 }, { "epoch": 0.7983261155174931, "grad_norm": 0.4727870062666133, "learning_rate": 4.035321089862106e-06, "loss": 0.057, "step": 10254 }, { "epoch": 0.7984039706097027, "grad_norm": 0.39322224023719615, "learning_rate": 4.032314785485869e-06, "loss": 0.0219, "step": 10255 }, { "epoch": 0.7984818257019123, "grad_norm": 0.4381298109109093, "learning_rate": 4.029309475821366e-06, "loss": 0.0439, "step": 10256 }, { "epoch": 0.798559680794122, "grad_norm": 0.4517922761992261, "learning_rate": 4.026305161055815e-06, "loss": 0.0482, "step": 10257 }, { "epoch": 0.7986375358863316, "grad_norm": 0.5406458347072363, "learning_rate": 4.023301841376368e-06, "loss": 0.0621, "step": 10258 }, { "epoch": 0.7987153909785412, "grad_norm": 0.4585557862245837, "learning_rate": 4.020299516970121e-06, "loss": 0.0392, "step": 10259 }, { "epoch": 0.7987932460707509, "grad_norm": 0.43235732199262017, "learning_rate": 4.017298188024097e-06, "loss": 0.0409, "step": 10260 }, { "epoch": 0.7988711011629605, "grad_norm": 0.45652256513115547, "learning_rate": 4.014297854725275e-06, "loss": 0.0353, "step": 10261 }, { "epoch": 0.79894895625517, "grad_norm": 0.39549725682513687, "learning_rate": 4.011298517260558e-06, "loss": 0.0349, "step": 10262 }, { "epoch": 0.7990268113473797, "grad_norm": 0.528501015156938, "learning_rate": 4.0083001758167885e-06, "loss": 0.0633, "step": 10263 }, { "epoch": 0.7991046664395893, "grad_norm": 0.4629302424900846, "learning_rate": 4.00530283058075e-06, "loss": 0.0537, "step": 10264 }, { "epoch": 0.7991825215317989, "grad_norm": 0.3399565047476849, "learning_rate": 4.002306481739167e-06, "loss": 0.0236, "step": 10265 }, { "epoch": 0.7992603766240086, "grad_norm": 0.4873897894256855, "learning_rate": 3.999311129478691e-06, "loss": 0.0466, "step": 10266 }, { "epoch": 0.7993382317162182, "grad_norm": 0.38383996536144666, "learning_rate": 3.996316773985922e-06, "loss": 0.0291, "step": 10267 }, { "epoch": 0.7994160868084278, "grad_norm": 0.3757149367292706, "learning_rate": 3.993323415447396e-06, "loss": 0.0344, "step": 10268 }, { "epoch": 0.7994939419006374, "grad_norm": 0.544152979256134, "learning_rate": 3.990331054049581e-06, "loss": 0.0596, "step": 10269 }, { "epoch": 0.7995717969928471, "grad_norm": 0.5401631259063417, "learning_rate": 3.9873396899788865e-06, "loss": 0.056, "step": 10270 }, { "epoch": 0.7996496520850567, "grad_norm": 0.4767998769814993, "learning_rate": 3.984349323421665e-06, "loss": 0.0497, "step": 10271 }, { "epoch": 0.7997275071772663, "grad_norm": 0.381861033430678, "learning_rate": 3.9813599545642035e-06, "loss": 0.0289, "step": 10272 }, { "epoch": 0.799805362269476, "grad_norm": 0.43567501753413607, "learning_rate": 3.978371583592721e-06, "loss": 0.0313, "step": 10273 }, { "epoch": 0.7998832173616856, "grad_norm": 0.46550310757859664, "learning_rate": 3.975384210693383e-06, "loss": 0.053, "step": 10274 }, { "epoch": 0.7999610724538951, "grad_norm": 0.4578549565354548, "learning_rate": 3.9723978360522844e-06, "loss": 0.0529, "step": 10275 }, { "epoch": 0.8000389275461048, "grad_norm": 0.5281972320543187, "learning_rate": 3.969412459855464e-06, "loss": 0.0573, "step": 10276 }, { "epoch": 0.8001167826383144, "grad_norm": 0.4224455507486815, "learning_rate": 3.966428082288896e-06, "loss": 0.0388, "step": 10277 }, { "epoch": 0.800194637730524, "grad_norm": 0.4591499507908712, "learning_rate": 3.9634447035384945e-06, "loss": 0.0435, "step": 10278 }, { "epoch": 0.8002724928227337, "grad_norm": 0.44916817366070527, "learning_rate": 3.960462323790111e-06, "loss": 0.0405, "step": 10279 }, { "epoch": 0.8003503479149433, "grad_norm": 0.44899573964012257, "learning_rate": 3.957480943229525e-06, "loss": 0.0435, "step": 10280 }, { "epoch": 0.8004282030071529, "grad_norm": 0.46685949301373497, "learning_rate": 3.954500562042478e-06, "loss": 0.0471, "step": 10281 }, { "epoch": 0.8005060580993626, "grad_norm": 1.0839890291576308, "learning_rate": 3.951521180414624e-06, "loss": 0.04, "step": 10282 }, { "epoch": 0.8005839131915722, "grad_norm": 0.4697102121230546, "learning_rate": 3.948542798531565e-06, "loss": 0.0456, "step": 10283 }, { "epoch": 0.8006617682837818, "grad_norm": 0.333598567662345, "learning_rate": 3.94556541657884e-06, "loss": 0.024, "step": 10284 }, { "epoch": 0.8007396233759915, "grad_norm": 0.5168620862407236, "learning_rate": 3.942589034741935e-06, "loss": 0.0545, "step": 10285 }, { "epoch": 0.8008174784682011, "grad_norm": 0.4547675994794709, "learning_rate": 3.939613653206245e-06, "loss": 0.049, "step": 10286 }, { "epoch": 0.8008953335604106, "grad_norm": 0.4683238270862948, "learning_rate": 3.936639272157139e-06, "loss": 0.0403, "step": 10287 }, { "epoch": 0.8009731886526203, "grad_norm": 0.5261044753073612, "learning_rate": 3.933665891779903e-06, "loss": 0.0699, "step": 10288 }, { "epoch": 0.8010510437448299, "grad_norm": 0.4120242056231258, "learning_rate": 3.930693512259763e-06, "loss": 0.0488, "step": 10289 }, { "epoch": 0.8011288988370395, "grad_norm": 0.4681986038405461, "learning_rate": 3.927722133781884e-06, "loss": 0.0463, "step": 10290 }, { "epoch": 0.8012067539292492, "grad_norm": 0.4686642618153874, "learning_rate": 3.924751756531364e-06, "loss": 0.0436, "step": 10291 }, { "epoch": 0.8012846090214588, "grad_norm": 0.46914178406155466, "learning_rate": 3.921782380693255e-06, "loss": 0.0441, "step": 10292 }, { "epoch": 0.8013624641136684, "grad_norm": 0.5333027840326394, "learning_rate": 3.9188140064525314e-06, "loss": 0.0519, "step": 10293 }, { "epoch": 0.8014403192058781, "grad_norm": 0.47037540073936346, "learning_rate": 3.915846633994112e-06, "loss": 0.0414, "step": 10294 }, { "epoch": 0.8015181742980877, "grad_norm": 0.46660659939369503, "learning_rate": 3.912880263502836e-06, "loss": 0.0454, "step": 10295 }, { "epoch": 0.8015960293902973, "grad_norm": 0.4022747280887023, "learning_rate": 3.909914895163503e-06, "loss": 0.0327, "step": 10296 }, { "epoch": 0.801673884482507, "grad_norm": 0.43588983980073026, "learning_rate": 3.906950529160842e-06, "loss": 0.0362, "step": 10297 }, { "epoch": 0.8017517395747166, "grad_norm": 0.40042583042095675, "learning_rate": 3.903987165679524e-06, "loss": 0.0348, "step": 10298 }, { "epoch": 0.8018295946669262, "grad_norm": 0.4658437472469757, "learning_rate": 3.901024804904145e-06, "loss": 0.0434, "step": 10299 }, { "epoch": 0.8019074497591359, "grad_norm": 0.4079452416932229, "learning_rate": 3.898063447019249e-06, "loss": 0.0317, "step": 10300 }, { "epoch": 0.8019074497591359, "eval_loss": 0.005887061357498169, "eval_runtime": 162.2877, "eval_samples_per_second": 17.746, "eval_steps_per_second": 0.635, "step": 10300 }, { "epoch": 0.8019853048513454, "grad_norm": 0.4196268500714818, "learning_rate": 3.8951030922093094e-06, "loss": 0.0362, "step": 10301 }, { "epoch": 0.802063159943555, "grad_norm": 0.4670062553562062, "learning_rate": 3.892143740658754e-06, "loss": 0.0492, "step": 10302 }, { "epoch": 0.8021410150357647, "grad_norm": 0.5301513625361055, "learning_rate": 3.889185392551933e-06, "loss": 0.0637, "step": 10303 }, { "epoch": 0.8022188701279743, "grad_norm": 0.4622014326773223, "learning_rate": 3.8862280480731305e-06, "loss": 0.054, "step": 10304 }, { "epoch": 0.8022967252201839, "grad_norm": 0.46363966115943567, "learning_rate": 3.883271707406577e-06, "loss": 0.0553, "step": 10305 }, { "epoch": 0.8023745803123935, "grad_norm": 0.5142102098264586, "learning_rate": 3.880316370736439e-06, "loss": 0.0483, "step": 10306 }, { "epoch": 0.8024524354046032, "grad_norm": 0.42466350774485173, "learning_rate": 3.8773620382468255e-06, "loss": 0.0335, "step": 10307 }, { "epoch": 0.8025302904968128, "grad_norm": 0.5609697551020004, "learning_rate": 3.874408710121773e-06, "loss": 0.0537, "step": 10308 }, { "epoch": 0.8026081455890224, "grad_norm": 0.4407189933453246, "learning_rate": 3.871456386545262e-06, "loss": 0.0396, "step": 10309 }, { "epoch": 0.8026860006812321, "grad_norm": 0.49608093451913815, "learning_rate": 3.86850506770121e-06, "loss": 0.0558, "step": 10310 }, { "epoch": 0.8027638557734417, "grad_norm": 0.5111568560903565, "learning_rate": 3.8655547537734575e-06, "loss": 0.0721, "step": 10311 }, { "epoch": 0.8028417108656513, "grad_norm": 0.607136232311662, "learning_rate": 3.862605444945817e-06, "loss": 0.0548, "step": 10312 }, { "epoch": 0.802919565957861, "grad_norm": 0.4672103480025686, "learning_rate": 3.859657141402003e-06, "loss": 0.0435, "step": 10313 }, { "epoch": 0.8029974210500705, "grad_norm": 0.5935536716225475, "learning_rate": 3.856709843325679e-06, "loss": 0.0607, "step": 10314 }, { "epoch": 0.8030752761422801, "grad_norm": 0.4407172939922719, "learning_rate": 3.853763550900453e-06, "loss": 0.0448, "step": 10315 }, { "epoch": 0.8031531312344898, "grad_norm": 0.522933676507536, "learning_rate": 3.850818264309857e-06, "loss": 0.0547, "step": 10316 }, { "epoch": 0.8032309863266994, "grad_norm": 0.5089028309450974, "learning_rate": 3.847873983737382e-06, "loss": 0.0409, "step": 10317 }, { "epoch": 0.803308841418909, "grad_norm": 0.4573610689082497, "learning_rate": 3.844930709366436e-06, "loss": 0.0523, "step": 10318 }, { "epoch": 0.8033866965111187, "grad_norm": 0.5016668843676527, "learning_rate": 3.841988441380371e-06, "loss": 0.0514, "step": 10319 }, { "epoch": 0.8034645516033283, "grad_norm": 0.464120394423356, "learning_rate": 3.839047179962479e-06, "loss": 0.0559, "step": 10320 }, { "epoch": 0.8035424066955379, "grad_norm": 0.5127949554654483, "learning_rate": 3.836106925295981e-06, "loss": 0.0598, "step": 10321 }, { "epoch": 0.8036202617877476, "grad_norm": 0.3713402100164315, "learning_rate": 3.833167677564045e-06, "loss": 0.0357, "step": 10322 }, { "epoch": 0.8036981168799572, "grad_norm": 0.5227542708056363, "learning_rate": 3.830229436949773e-06, "loss": 0.0772, "step": 10323 }, { "epoch": 0.8037759719721668, "grad_norm": 0.5980062753219435, "learning_rate": 3.8272922036362034e-06, "loss": 0.0785, "step": 10324 }, { "epoch": 0.8038538270643765, "grad_norm": 0.3700743785527413, "learning_rate": 3.8243559778063085e-06, "loss": 0.0288, "step": 10325 }, { "epoch": 0.8039316821565861, "grad_norm": 0.2997634989549588, "learning_rate": 3.821420759643006e-06, "loss": 0.0205, "step": 10326 }, { "epoch": 0.8040095372487956, "grad_norm": 0.518186851240776, "learning_rate": 3.818486549329139e-06, "loss": 0.0477, "step": 10327 }, { "epoch": 0.8040873923410053, "grad_norm": 0.627412520475968, "learning_rate": 3.815553347047505e-06, "loss": 0.0686, "step": 10328 }, { "epoch": 0.8041652474332149, "grad_norm": 0.4177962434369861, "learning_rate": 3.812621152980822e-06, "loss": 0.0405, "step": 10329 }, { "epoch": 0.8042431025254245, "grad_norm": 0.3584621315801935, "learning_rate": 3.809689967311756e-06, "loss": 0.0292, "step": 10330 }, { "epoch": 0.8043209576176342, "grad_norm": 0.43868522905745727, "learning_rate": 3.8067597902229024e-06, "loss": 0.0393, "step": 10331 }, { "epoch": 0.8043988127098438, "grad_norm": 0.37682196100826376, "learning_rate": 3.8038306218968003e-06, "loss": 0.0261, "step": 10332 }, { "epoch": 0.8044766678020534, "grad_norm": 0.4192037166736947, "learning_rate": 3.8009024625159207e-06, "loss": 0.0426, "step": 10333 }, { "epoch": 0.8045545228942631, "grad_norm": 0.4601271631044659, "learning_rate": 3.797975312262676e-06, "loss": 0.0458, "step": 10334 }, { "epoch": 0.8046323779864727, "grad_norm": 0.5757080349749386, "learning_rate": 3.795049171319414e-06, "loss": 0.0648, "step": 10335 }, { "epoch": 0.8047102330786823, "grad_norm": 0.5020092234361727, "learning_rate": 3.792124039868417e-06, "loss": 0.052, "step": 10336 }, { "epoch": 0.804788088170892, "grad_norm": 0.5283555771848135, "learning_rate": 3.7891999180919035e-06, "loss": 0.0575, "step": 10337 }, { "epoch": 0.8048659432631016, "grad_norm": 0.4779644349877192, "learning_rate": 3.786276806172042e-06, "loss": 0.0411, "step": 10338 }, { "epoch": 0.8049437983553112, "grad_norm": 0.40829623390581077, "learning_rate": 3.7833547042909246e-06, "loss": 0.045, "step": 10339 }, { "epoch": 0.8050216534475209, "grad_norm": 0.4424159968240295, "learning_rate": 3.7804336126305853e-06, "loss": 0.0388, "step": 10340 }, { "epoch": 0.8050995085397304, "grad_norm": 0.5273206792340411, "learning_rate": 3.777513531372994e-06, "loss": 0.0751, "step": 10341 }, { "epoch": 0.80517736363194, "grad_norm": 0.47151135237574543, "learning_rate": 3.7745944607000563e-06, "loss": 0.0455, "step": 10342 }, { "epoch": 0.8052552187241496, "grad_norm": 0.3516516049879042, "learning_rate": 3.771676400793616e-06, "loss": 0.0299, "step": 10343 }, { "epoch": 0.8053330738163593, "grad_norm": 0.47415287882625584, "learning_rate": 3.768759351835458e-06, "loss": 0.0439, "step": 10344 }, { "epoch": 0.8054109289085689, "grad_norm": 0.577656121713072, "learning_rate": 3.7658433140072983e-06, "loss": 0.0677, "step": 10345 }, { "epoch": 0.8054887840007785, "grad_norm": 0.4423577575162989, "learning_rate": 3.7629282874907923e-06, "loss": 0.0434, "step": 10346 }, { "epoch": 0.8055666390929882, "grad_norm": 0.433450951906915, "learning_rate": 3.7600142724675293e-06, "loss": 0.0372, "step": 10347 }, { "epoch": 0.8056444941851978, "grad_norm": 0.45086323478632706, "learning_rate": 3.7571012691190457e-06, "loss": 0.0451, "step": 10348 }, { "epoch": 0.8057223492774074, "grad_norm": 0.459000331961284, "learning_rate": 3.754189277626805e-06, "loss": 0.0394, "step": 10349 }, { "epoch": 0.8058002043696171, "grad_norm": 0.5311674026138441, "learning_rate": 3.7512782981722118e-06, "loss": 0.045, "step": 10350 }, { "epoch": 0.8058002043696171, "eval_loss": 0.005848470609635115, "eval_runtime": 162.5592, "eval_samples_per_second": 17.717, "eval_steps_per_second": 0.634, "step": 10350 }, { "epoch": 0.8058780594618267, "grad_norm": 0.5938863097274945, "learning_rate": 3.7483683309366025e-06, "loss": 0.0721, "step": 10351 }, { "epoch": 0.8059559145540363, "grad_norm": 0.3921305118555636, "learning_rate": 3.7454593761012594e-06, "loss": 0.033, "step": 10352 }, { "epoch": 0.806033769646246, "grad_norm": 0.3849713329842894, "learning_rate": 3.7425514338473924e-06, "loss": 0.0268, "step": 10353 }, { "epoch": 0.8061116247384555, "grad_norm": 0.49308956270813437, "learning_rate": 3.7396445043561547e-06, "loss": 0.0505, "step": 10354 }, { "epoch": 0.8061894798306651, "grad_norm": 0.5983549097407003, "learning_rate": 3.736738587808635e-06, "loss": 0.0773, "step": 10355 }, { "epoch": 0.8062673349228748, "grad_norm": 0.43152042385581363, "learning_rate": 3.733833684385855e-06, "loss": 0.0508, "step": 10356 }, { "epoch": 0.8063451900150844, "grad_norm": 0.4355682204060164, "learning_rate": 3.7309297942687763e-06, "loss": 0.0408, "step": 10357 }, { "epoch": 0.806423045107294, "grad_norm": 0.5893275357772247, "learning_rate": 3.728026917638303e-06, "loss": 0.061, "step": 10358 }, { "epoch": 0.8065009001995037, "grad_norm": 0.564365326113808, "learning_rate": 3.725125054675267e-06, "loss": 0.0594, "step": 10359 }, { "epoch": 0.8065787552917133, "grad_norm": 0.44645059578942337, "learning_rate": 3.722224205560443e-06, "loss": 0.0463, "step": 10360 }, { "epoch": 0.8066566103839229, "grad_norm": 0.5822285165483319, "learning_rate": 3.719324370474544e-06, "loss": 0.0668, "step": 10361 }, { "epoch": 0.8067344654761326, "grad_norm": 0.399786772799367, "learning_rate": 3.716425549598206e-06, "loss": 0.0236, "step": 10362 }, { "epoch": 0.8068123205683422, "grad_norm": 0.3773881020152238, "learning_rate": 3.713527743112011e-06, "loss": 0.0321, "step": 10363 }, { "epoch": 0.8068901756605518, "grad_norm": 0.3892737286682942, "learning_rate": 3.71063095119649e-06, "loss": 0.0384, "step": 10364 }, { "epoch": 0.8069680307527615, "grad_norm": 0.471915564306433, "learning_rate": 3.707735174032092e-06, "loss": 0.045, "step": 10365 }, { "epoch": 0.8070458858449711, "grad_norm": 0.42679160361198876, "learning_rate": 3.704840411799213e-06, "loss": 0.0422, "step": 10366 }, { "epoch": 0.8071237409371806, "grad_norm": 0.48673203051140596, "learning_rate": 3.7019466646781845e-06, "loss": 0.0453, "step": 10367 }, { "epoch": 0.8072015960293903, "grad_norm": 0.5169083903708757, "learning_rate": 3.6990539328492637e-06, "loss": 0.0535, "step": 10368 }, { "epoch": 0.8072794511215999, "grad_norm": 0.37873466997391897, "learning_rate": 3.696162216492667e-06, "loss": 0.0248, "step": 10369 }, { "epoch": 0.8073573062138095, "grad_norm": 0.34656294067340715, "learning_rate": 3.6932715157885347e-06, "loss": 0.0253, "step": 10370 }, { "epoch": 0.8074351613060192, "grad_norm": 0.4420106615818914, "learning_rate": 3.690381830916936e-06, "loss": 0.0351, "step": 10371 }, { "epoch": 0.8075130163982288, "grad_norm": 0.4227716738872754, "learning_rate": 3.6874931620578848e-06, "loss": 0.0296, "step": 10372 }, { "epoch": 0.8075908714904384, "grad_norm": 0.320330095263933, "learning_rate": 3.684605509391328e-06, "loss": 0.0223, "step": 10373 }, { "epoch": 0.8076687265826481, "grad_norm": 0.4912158997142032, "learning_rate": 3.6817188730971665e-06, "loss": 0.0481, "step": 10374 }, { "epoch": 0.8077465816748577, "grad_norm": 0.46048428695979116, "learning_rate": 3.678833253355214e-06, "loss": 0.0428, "step": 10375 }, { "epoch": 0.8078244367670673, "grad_norm": 0.47003288581851344, "learning_rate": 3.6759486503452357e-06, "loss": 0.0519, "step": 10376 }, { "epoch": 0.8079022918592769, "grad_norm": 0.3698345548225743, "learning_rate": 3.6730650642469257e-06, "loss": 0.0292, "step": 10377 }, { "epoch": 0.8079801469514866, "grad_norm": 0.5135481754633444, "learning_rate": 3.6701824952399134e-06, "loss": 0.0452, "step": 10378 }, { "epoch": 0.8080580020436962, "grad_norm": 0.41610753484805857, "learning_rate": 3.6673009435037844e-06, "loss": 0.0329, "step": 10379 }, { "epoch": 0.8081358571359057, "grad_norm": 0.4790548682974142, "learning_rate": 3.6644204092180325e-06, "loss": 0.0503, "step": 10380 }, { "epoch": 0.8082137122281154, "grad_norm": 0.416842914341654, "learning_rate": 3.6615408925621056e-06, "loss": 0.0423, "step": 10381 }, { "epoch": 0.808291567320325, "grad_norm": 0.4630118388156887, "learning_rate": 3.658662393715382e-06, "loss": 0.044, "step": 10382 }, { "epoch": 0.8083694224125346, "grad_norm": 0.43093898898240346, "learning_rate": 3.655784912857174e-06, "loss": 0.0486, "step": 10383 }, { "epoch": 0.8084472775047443, "grad_norm": 0.5207181197979339, "learning_rate": 3.65290845016675e-06, "loss": 0.0731, "step": 10384 }, { "epoch": 0.8085251325969539, "grad_norm": 0.4239737392836514, "learning_rate": 3.650033005823288e-06, "loss": 0.0334, "step": 10385 }, { "epoch": 0.8086029876891635, "grad_norm": 0.43109367986697406, "learning_rate": 3.6471585800059204e-06, "loss": 0.0518, "step": 10386 }, { "epoch": 0.8086808427813732, "grad_norm": 0.3568292788983624, "learning_rate": 3.6442851728937066e-06, "loss": 0.0278, "step": 10387 }, { "epoch": 0.8087586978735828, "grad_norm": 0.39826987433991967, "learning_rate": 3.641412784665648e-06, "loss": 0.0439, "step": 10388 }, { "epoch": 0.8088365529657924, "grad_norm": 0.6199928863339939, "learning_rate": 3.638541415500683e-06, "loss": 0.0868, "step": 10389 }, { "epoch": 0.8089144080580021, "grad_norm": 0.5115326849904275, "learning_rate": 3.6356710655776796e-06, "loss": 0.0532, "step": 10390 }, { "epoch": 0.8089922631502117, "grad_norm": 0.4995860987845713, "learning_rate": 3.632801735075451e-06, "loss": 0.0535, "step": 10391 }, { "epoch": 0.8090701182424213, "grad_norm": 0.4194719260544834, "learning_rate": 3.6299334241727404e-06, "loss": 0.0286, "step": 10392 }, { "epoch": 0.809147973334631, "grad_norm": 0.38783967579063233, "learning_rate": 3.627066133048227e-06, "loss": 0.0352, "step": 10393 }, { "epoch": 0.8092258284268405, "grad_norm": 0.42121463248588376, "learning_rate": 3.6241998618805397e-06, "loss": 0.0316, "step": 10394 }, { "epoch": 0.8093036835190501, "grad_norm": 0.4899186662790958, "learning_rate": 3.6213346108482285e-06, "loss": 0.0488, "step": 10395 }, { "epoch": 0.8093815386112598, "grad_norm": 0.47916386424844437, "learning_rate": 3.6184703801297814e-06, "loss": 0.0553, "step": 10396 }, { "epoch": 0.8094593937034694, "grad_norm": 0.5226289357568483, "learning_rate": 3.6156071699036323e-06, "loss": 0.0624, "step": 10397 }, { "epoch": 0.809537248795679, "grad_norm": 0.44925514942759054, "learning_rate": 3.612744980348142e-06, "loss": 0.0446, "step": 10398 }, { "epoch": 0.8096151038878887, "grad_norm": 0.38671517962210056, "learning_rate": 3.6098838116416124e-06, "loss": 0.0293, "step": 10399 }, { "epoch": 0.8096929589800983, "grad_norm": 0.4720217153606091, "learning_rate": 3.6070236639622835e-06, "loss": 0.0422, "step": 10400 }, { "epoch": 0.8096929589800983, "eval_loss": 0.00581357954069972, "eval_runtime": 161.9056, "eval_samples_per_second": 17.788, "eval_steps_per_second": 0.636, "step": 10400 }, { "epoch": 0.8097708140723079, "grad_norm": 0.5665481210191915, "learning_rate": 3.604164537488324e-06, "loss": 0.0601, "step": 10401 }, { "epoch": 0.8098486691645176, "grad_norm": 0.46179644751279697, "learning_rate": 3.6013064323978465e-06, "loss": 0.042, "step": 10402 }, { "epoch": 0.8099265242567272, "grad_norm": 0.42536135268231823, "learning_rate": 3.598449348868898e-06, "loss": 0.0506, "step": 10403 }, { "epoch": 0.8100043793489368, "grad_norm": 0.4109651004227204, "learning_rate": 3.5955932870794574e-06, "loss": 0.0372, "step": 10404 }, { "epoch": 0.8100822344411465, "grad_norm": 0.5460601388326837, "learning_rate": 3.59273824720745e-06, "loss": 0.0513, "step": 10405 }, { "epoch": 0.810160089533356, "grad_norm": 0.44263266449535915, "learning_rate": 3.589884229430731e-06, "loss": 0.0496, "step": 10406 }, { "epoch": 0.8102379446255656, "grad_norm": 0.6257409078513988, "learning_rate": 3.5870312339270897e-06, "loss": 0.0789, "step": 10407 }, { "epoch": 0.8103157997177753, "grad_norm": 0.32185715125267794, "learning_rate": 3.584179260874254e-06, "loss": 0.0214, "step": 10408 }, { "epoch": 0.8103936548099849, "grad_norm": 0.4100704403566148, "learning_rate": 3.5813283104498896e-06, "loss": 0.0445, "step": 10409 }, { "epoch": 0.8104715099021945, "grad_norm": 0.5263446727980989, "learning_rate": 3.578478382831596e-06, "loss": 0.043, "step": 10410 }, { "epoch": 0.8105493649944042, "grad_norm": 0.46685654347383804, "learning_rate": 3.5756294781969116e-06, "loss": 0.0406, "step": 10411 }, { "epoch": 0.8106272200866138, "grad_norm": 0.5326363815240251, "learning_rate": 3.5727815967233095e-06, "loss": 0.0564, "step": 10412 }, { "epoch": 0.8107050751788234, "grad_norm": 0.45501143752256923, "learning_rate": 3.569934738588201e-06, "loss": 0.0583, "step": 10413 }, { "epoch": 0.810782930271033, "grad_norm": 0.4271113375376808, "learning_rate": 3.567088903968923e-06, "loss": 0.0322, "step": 10414 }, { "epoch": 0.8108607853632427, "grad_norm": 0.40202087744294984, "learning_rate": 3.5642440930427704e-06, "loss": 0.0354, "step": 10415 }, { "epoch": 0.8109386404554523, "grad_norm": 0.365356805294541, "learning_rate": 3.5614003059869552e-06, "loss": 0.0298, "step": 10416 }, { "epoch": 0.8110164955476619, "grad_norm": 0.415919351794021, "learning_rate": 3.5585575429786335e-06, "loss": 0.0362, "step": 10417 }, { "epoch": 0.8110943506398716, "grad_norm": 0.5222920423856942, "learning_rate": 3.5557158041948945e-06, "loss": 0.054, "step": 10418 }, { "epoch": 0.8111722057320812, "grad_norm": 0.32961490334280463, "learning_rate": 3.5528750898127663e-06, "loss": 0.0208, "step": 10419 }, { "epoch": 0.8112500608242907, "grad_norm": 0.5021966577467022, "learning_rate": 3.550035400009213e-06, "loss": 0.0522, "step": 10420 }, { "epoch": 0.8113279159165004, "grad_norm": 0.3654266545726016, "learning_rate": 3.5471967349611316e-06, "loss": 0.0312, "step": 10421 }, { "epoch": 0.81140577100871, "grad_norm": 0.4316306152929971, "learning_rate": 3.5443590948453575e-06, "loss": 0.0414, "step": 10422 }, { "epoch": 0.8114836261009196, "grad_norm": 0.4568506051445443, "learning_rate": 3.541522479838666e-06, "loss": 0.043, "step": 10423 }, { "epoch": 0.8115614811931293, "grad_norm": 0.43494236767407296, "learning_rate": 3.538686890117755e-06, "loss": 0.0382, "step": 10424 }, { "epoch": 0.8116393362853389, "grad_norm": 0.3729891002653655, "learning_rate": 3.5358523258592815e-06, "loss": 0.0308, "step": 10425 }, { "epoch": 0.8117171913775485, "grad_norm": 0.36710505879264976, "learning_rate": 3.533018787239819e-06, "loss": 0.0291, "step": 10426 }, { "epoch": 0.8117950464697582, "grad_norm": 0.5064069989635893, "learning_rate": 3.530186274435887e-06, "loss": 0.0514, "step": 10427 }, { "epoch": 0.8118729015619678, "grad_norm": 0.4689616573857986, "learning_rate": 3.527354787623938e-06, "loss": 0.0432, "step": 10428 }, { "epoch": 0.8119507566541774, "grad_norm": 0.4603364781228718, "learning_rate": 3.5245243269803477e-06, "loss": 0.039, "step": 10429 }, { "epoch": 0.8120286117463871, "grad_norm": 0.4596681357436553, "learning_rate": 3.5216948926814556e-06, "loss": 0.0409, "step": 10430 }, { "epoch": 0.8121064668385967, "grad_norm": 0.4952257467605166, "learning_rate": 3.518866484903516e-06, "loss": 0.0515, "step": 10431 }, { "epoch": 0.8121843219308063, "grad_norm": 0.39432228252929163, "learning_rate": 3.516039103822728e-06, "loss": 0.0381, "step": 10432 }, { "epoch": 0.812262177023016, "grad_norm": 0.45146205645098836, "learning_rate": 3.51321274961522e-06, "loss": 0.0438, "step": 10433 }, { "epoch": 0.8123400321152255, "grad_norm": 0.5345486529196681, "learning_rate": 3.51038742245706e-06, "loss": 0.055, "step": 10434 }, { "epoch": 0.8124178872074351, "grad_norm": 0.467371020183219, "learning_rate": 3.50756312252426e-06, "loss": 0.0489, "step": 10435 }, { "epoch": 0.8124957422996448, "grad_norm": 0.5460739909676336, "learning_rate": 3.5047398499927553e-06, "loss": 0.048, "step": 10436 }, { "epoch": 0.8125735973918544, "grad_norm": 0.568409266072787, "learning_rate": 3.5019176050384296e-06, "loss": 0.069, "step": 10437 }, { "epoch": 0.812651452484064, "grad_norm": 0.422946257890497, "learning_rate": 3.4990963878370822e-06, "loss": 0.0452, "step": 10438 }, { "epoch": 0.8127293075762737, "grad_norm": 0.41520000769880155, "learning_rate": 3.49627619856447e-06, "loss": 0.0362, "step": 10439 }, { "epoch": 0.8128071626684833, "grad_norm": 0.4929451563965795, "learning_rate": 3.493457037396273e-06, "loss": 0.048, "step": 10440 }, { "epoch": 0.8128850177606929, "grad_norm": 0.4433274983217683, "learning_rate": 3.490638904508119e-06, "loss": 0.0433, "step": 10441 }, { "epoch": 0.8129628728529026, "grad_norm": 0.3893846952707035, "learning_rate": 3.4878218000755593e-06, "loss": 0.0304, "step": 10442 }, { "epoch": 0.8130407279451122, "grad_norm": 0.42146533797924396, "learning_rate": 3.4850057242740864e-06, "loss": 0.0387, "step": 10443 }, { "epoch": 0.8131185830373218, "grad_norm": 0.5452513528969476, "learning_rate": 3.4821906772791315e-06, "loss": 0.0566, "step": 10444 }, { "epoch": 0.8131964381295315, "grad_norm": 0.5551761642232567, "learning_rate": 3.4793766592660516e-06, "loss": 0.0583, "step": 10445 }, { "epoch": 0.813274293221741, "grad_norm": 0.4222345009687831, "learning_rate": 3.4765636704101626e-06, "loss": 0.0392, "step": 10446 }, { "epoch": 0.8133521483139506, "grad_norm": 0.40569152361332866, "learning_rate": 3.4737517108866835e-06, "loss": 0.041, "step": 10447 }, { "epoch": 0.8134300034061602, "grad_norm": 0.4262240362671948, "learning_rate": 3.470940780870793e-06, "loss": 0.0394, "step": 10448 }, { "epoch": 0.8135078584983699, "grad_norm": 0.41201616592815543, "learning_rate": 3.468130880537599e-06, "loss": 0.0438, "step": 10449 }, { "epoch": 0.8135857135905795, "grad_norm": 0.3556680719136533, "learning_rate": 3.4653220100621397e-06, "loss": 0.0288, "step": 10450 }, { "epoch": 0.8135857135905795, "eval_loss": 0.005803921725600958, "eval_runtime": 162.4411, "eval_samples_per_second": 17.73, "eval_steps_per_second": 0.634, "step": 10450 }, { "epoch": 0.8136635686827891, "grad_norm": 0.4170692439763938, "learning_rate": 3.462514169619404e-06, "loss": 0.0392, "step": 10451 }, { "epoch": 0.8137414237749988, "grad_norm": 0.3913792996133792, "learning_rate": 3.459707359384301e-06, "loss": 0.0298, "step": 10452 }, { "epoch": 0.8138192788672084, "grad_norm": 0.40032388576576716, "learning_rate": 3.456901579531684e-06, "loss": 0.0373, "step": 10453 }, { "epoch": 0.813897133959418, "grad_norm": 0.49688389731994226, "learning_rate": 3.45409683023634e-06, "loss": 0.0631, "step": 10454 }, { "epoch": 0.8139749890516277, "grad_norm": 0.3819123860784594, "learning_rate": 3.451293111672991e-06, "loss": 0.0246, "step": 10455 }, { "epoch": 0.8140528441438373, "grad_norm": 0.3873615689779953, "learning_rate": 3.4484904240162953e-06, "loss": 0.0378, "step": 10456 }, { "epoch": 0.8141306992360469, "grad_norm": 0.5040609513527757, "learning_rate": 3.4456887674408467e-06, "loss": 0.0639, "step": 10457 }, { "epoch": 0.8142085543282566, "grad_norm": 0.41062015675324126, "learning_rate": 3.4428881421211745e-06, "loss": 0.037, "step": 10458 }, { "epoch": 0.8142864094204662, "grad_norm": 0.5212862783180419, "learning_rate": 3.440088548231748e-06, "loss": 0.0557, "step": 10459 }, { "epoch": 0.8143642645126757, "grad_norm": 0.4574079053405525, "learning_rate": 3.437289985946961e-06, "loss": 0.0389, "step": 10460 }, { "epoch": 0.8144421196048854, "grad_norm": 0.5715515273100747, "learning_rate": 3.4344924554411608e-06, "loss": 0.0601, "step": 10461 }, { "epoch": 0.814519974697095, "grad_norm": 0.3529606005117192, "learning_rate": 3.4316959568886166e-06, "loss": 0.0263, "step": 10462 }, { "epoch": 0.8145978297893046, "grad_norm": 0.4422296425700657, "learning_rate": 3.428900490463536e-06, "loss": 0.044, "step": 10463 }, { "epoch": 0.8146756848815143, "grad_norm": 0.5025582747791133, "learning_rate": 3.4261060563400618e-06, "loss": 0.0376, "step": 10464 }, { "epoch": 0.8147535399737239, "grad_norm": 0.3580580414105159, "learning_rate": 3.423312654692279e-06, "loss": 0.0236, "step": 10465 }, { "epoch": 0.8148313950659335, "grad_norm": 0.5684421844578428, "learning_rate": 3.4205202856941976e-06, "loss": 0.0784, "step": 10466 }, { "epoch": 0.8149092501581432, "grad_norm": 0.48658535822624005, "learning_rate": 3.417728949519774e-06, "loss": 0.0458, "step": 10467 }, { "epoch": 0.8149871052503528, "grad_norm": 0.46446162926083073, "learning_rate": 3.4149386463428937e-06, "loss": 0.0451, "step": 10468 }, { "epoch": 0.8150649603425624, "grad_norm": 0.4707099671190537, "learning_rate": 3.412149376337379e-06, "loss": 0.053, "step": 10469 }, { "epoch": 0.8151428154347721, "grad_norm": 0.4158877356455512, "learning_rate": 3.409361139676981e-06, "loss": 0.0356, "step": 10470 }, { "epoch": 0.8152206705269817, "grad_norm": 0.41413451909845306, "learning_rate": 3.4065739365354087e-06, "loss": 0.0367, "step": 10471 }, { "epoch": 0.8152985256191912, "grad_norm": 0.4712291808810069, "learning_rate": 3.403787767086284e-06, "loss": 0.0378, "step": 10472 }, { "epoch": 0.815376380711401, "grad_norm": 0.36401373952573324, "learning_rate": 3.401002631503172e-06, "loss": 0.0218, "step": 10473 }, { "epoch": 0.8154542358036105, "grad_norm": 0.38176925610412077, "learning_rate": 3.3982185299595716e-06, "loss": 0.0329, "step": 10474 }, { "epoch": 0.8155320908958201, "grad_norm": 0.4614521941803624, "learning_rate": 3.3954354626289243e-06, "loss": 0.0463, "step": 10475 }, { "epoch": 0.8156099459880298, "grad_norm": 0.3887263101051977, "learning_rate": 3.392653429684598e-06, "loss": 0.0367, "step": 10476 }, { "epoch": 0.8156878010802394, "grad_norm": 0.5261953893600131, "learning_rate": 3.389872431299903e-06, "loss": 0.0622, "step": 10477 }, { "epoch": 0.815765656172449, "grad_norm": 0.4054300976939607, "learning_rate": 3.387092467648079e-06, "loss": 0.033, "step": 10478 }, { "epoch": 0.8158435112646587, "grad_norm": 0.460148809819102, "learning_rate": 3.3843135389023083e-06, "loss": 0.0467, "step": 10479 }, { "epoch": 0.8159213663568683, "grad_norm": 0.4834841440576065, "learning_rate": 3.3815356452357028e-06, "loss": 0.0463, "step": 10480 }, { "epoch": 0.8159992214490779, "grad_norm": 0.42287722406032535, "learning_rate": 3.378758786821308e-06, "loss": 0.0293, "step": 10481 }, { "epoch": 0.8160770765412876, "grad_norm": 0.3809202800174474, "learning_rate": 3.3759829638321185e-06, "loss": 0.0359, "step": 10482 }, { "epoch": 0.8161549316334972, "grad_norm": 0.46586294805161244, "learning_rate": 3.373208176441052e-06, "loss": 0.0407, "step": 10483 }, { "epoch": 0.8162327867257068, "grad_norm": 0.5604935864304537, "learning_rate": 3.370434424820963e-06, "loss": 0.0613, "step": 10484 }, { "epoch": 0.8163106418179163, "grad_norm": 0.5166599237438972, "learning_rate": 3.3676617091446473e-06, "loss": 0.042, "step": 10485 }, { "epoch": 0.816388496910126, "grad_norm": 0.37773768086077014, "learning_rate": 3.36489002958482e-06, "loss": 0.0319, "step": 10486 }, { "epoch": 0.8164663520023356, "grad_norm": 0.4085155749683989, "learning_rate": 3.3621193863141554e-06, "loss": 0.036, "step": 10487 }, { "epoch": 0.8165442070945452, "grad_norm": 0.46398068964311256, "learning_rate": 3.3593497795052475e-06, "loss": 0.0417, "step": 10488 }, { "epoch": 0.8166220621867549, "grad_norm": 0.44924697572146405, "learning_rate": 3.356581209330632e-06, "loss": 0.0348, "step": 10489 }, { "epoch": 0.8166999172789645, "grad_norm": 0.5455171315283937, "learning_rate": 3.353813675962776e-06, "loss": 0.0621, "step": 10490 }, { "epoch": 0.8167777723711741, "grad_norm": 0.3831525417916593, "learning_rate": 3.3510471795740786e-06, "loss": 0.0255, "step": 10491 }, { "epoch": 0.8168556274633838, "grad_norm": 0.5305320295733806, "learning_rate": 3.34828172033689e-06, "loss": 0.0509, "step": 10492 }, { "epoch": 0.8169334825555934, "grad_norm": 0.36664118483725117, "learning_rate": 3.3455172984234795e-06, "loss": 0.0275, "step": 10493 }, { "epoch": 0.817011337647803, "grad_norm": 0.36885033633637315, "learning_rate": 3.342753914006063e-06, "loss": 0.027, "step": 10494 }, { "epoch": 0.8170891927400127, "grad_norm": 0.46473725191060655, "learning_rate": 3.3399915672567774e-06, "loss": 0.0435, "step": 10495 }, { "epoch": 0.8171670478322223, "grad_norm": 0.4779203640976673, "learning_rate": 3.3372302583477035e-06, "loss": 0.0486, "step": 10496 }, { "epoch": 0.8172449029244319, "grad_norm": 0.40349035129191907, "learning_rate": 3.334469987450868e-06, "loss": 0.0375, "step": 10497 }, { "epoch": 0.8173227580166416, "grad_norm": 0.40490055524985047, "learning_rate": 3.3317107547382176e-06, "loss": 0.0329, "step": 10498 }, { "epoch": 0.8174006131088511, "grad_norm": 0.46253035253445945, "learning_rate": 3.3289525603816376e-06, "loss": 0.0442, "step": 10499 }, { "epoch": 0.8174784682010607, "grad_norm": 0.44589309171844815, "learning_rate": 3.3261954045529544e-06, "loss": 0.037, "step": 10500 }, { "epoch": 0.8174784682010607, "eval_loss": 0.005745603237301111, "eval_runtime": 162.5394, "eval_samples_per_second": 17.719, "eval_steps_per_second": 0.634, "step": 10500 }, { "epoch": 0.8175563232932704, "grad_norm": 0.6610377633190594, "learning_rate": 3.3234392874239172e-06, "loss": 0.0856, "step": 10501 }, { "epoch": 0.81763417838548, "grad_norm": 0.34854924575104995, "learning_rate": 3.3206842091662294e-06, "loss": 0.0236, "step": 10502 }, { "epoch": 0.8177120334776896, "grad_norm": 0.3701333323279624, "learning_rate": 3.3179301699515243e-06, "loss": 0.0311, "step": 10503 }, { "epoch": 0.8177898885698993, "grad_norm": 0.47140384634767474, "learning_rate": 3.315177169951349e-06, "loss": 0.0417, "step": 10504 }, { "epoch": 0.8178677436621089, "grad_norm": 0.4333039051559388, "learning_rate": 3.312425209337211e-06, "loss": 0.0419, "step": 10505 }, { "epoch": 0.8179455987543185, "grad_norm": 0.5239196389931623, "learning_rate": 3.30967428828054e-06, "loss": 0.0548, "step": 10506 }, { "epoch": 0.8180234538465282, "grad_norm": 0.4610462713575119, "learning_rate": 3.3069244069527116e-06, "loss": 0.0478, "step": 10507 }, { "epoch": 0.8181013089387378, "grad_norm": 0.4936889426051544, "learning_rate": 3.304175565525032e-06, "loss": 0.0481, "step": 10508 }, { "epoch": 0.8181791640309474, "grad_norm": 0.6113703995873739, "learning_rate": 3.3014277641687342e-06, "loss": 0.0767, "step": 10509 }, { "epoch": 0.8182570191231571, "grad_norm": 0.3966975244124243, "learning_rate": 3.2986810030549996e-06, "loss": 0.0346, "step": 10510 }, { "epoch": 0.8183348742153667, "grad_norm": 0.4074545664301825, "learning_rate": 3.2959352823549275e-06, "loss": 0.0364, "step": 10511 }, { "epoch": 0.8184127293075762, "grad_norm": 0.5885371412657838, "learning_rate": 3.293190602239582e-06, "loss": 0.0515, "step": 10512 }, { "epoch": 0.818490584399786, "grad_norm": 0.44588881544948084, "learning_rate": 3.2904469628799298e-06, "loss": 0.0483, "step": 10513 }, { "epoch": 0.8185684394919955, "grad_norm": 0.615281422664394, "learning_rate": 3.2877043644468887e-06, "loss": 0.0869, "step": 10514 }, { "epoch": 0.8186462945842051, "grad_norm": 0.48103713773865725, "learning_rate": 3.284962807111309e-06, "loss": 0.0481, "step": 10515 }, { "epoch": 0.8187241496764148, "grad_norm": 0.41234555496711145, "learning_rate": 3.2822222910439815e-06, "loss": 0.0317, "step": 10516 }, { "epoch": 0.8188020047686244, "grad_norm": 0.4119342987722972, "learning_rate": 3.27948281641562e-06, "loss": 0.0392, "step": 10517 }, { "epoch": 0.818879859860834, "grad_norm": 0.414127329284999, "learning_rate": 3.2767443833968884e-06, "loss": 0.0314, "step": 10518 }, { "epoch": 0.8189577149530437, "grad_norm": 0.6310048574185219, "learning_rate": 3.2740069921583785e-06, "loss": 0.0972, "step": 10519 }, { "epoch": 0.8190355700452533, "grad_norm": 0.4937732749500379, "learning_rate": 3.271270642870612e-06, "loss": 0.0474, "step": 10520 }, { "epoch": 0.8191134251374629, "grad_norm": 0.4638027464191745, "learning_rate": 3.2685353357040505e-06, "loss": 0.0522, "step": 10521 }, { "epoch": 0.8191912802296725, "grad_norm": 0.4449695744300588, "learning_rate": 3.2658010708290953e-06, "loss": 0.0455, "step": 10522 }, { "epoch": 0.8192691353218822, "grad_norm": 0.4198950172360952, "learning_rate": 3.263067848416075e-06, "loss": 0.038, "step": 10523 }, { "epoch": 0.8193469904140918, "grad_norm": 0.4018187208750796, "learning_rate": 3.260335668635257e-06, "loss": 0.0314, "step": 10524 }, { "epoch": 0.8194248455063013, "grad_norm": 0.4550579560198918, "learning_rate": 3.257604531656846e-06, "loss": 0.044, "step": 10525 }, { "epoch": 0.819502700598511, "grad_norm": 0.34766128148089226, "learning_rate": 3.2548744376509743e-06, "loss": 0.0172, "step": 10526 }, { "epoch": 0.8195805556907206, "grad_norm": 0.5334504662447387, "learning_rate": 3.252145386787711e-06, "loss": 0.0605, "step": 10527 }, { "epoch": 0.8196584107829302, "grad_norm": 0.385919508120219, "learning_rate": 3.249417379237072e-06, "loss": 0.0287, "step": 10528 }, { "epoch": 0.8197362658751399, "grad_norm": 0.38821609762760895, "learning_rate": 3.246690415168998e-06, "loss": 0.036, "step": 10529 }, { "epoch": 0.8198141209673495, "grad_norm": 0.43588122910701066, "learning_rate": 3.243964494753362e-06, "loss": 0.0399, "step": 10530 }, { "epoch": 0.8198919760595591, "grad_norm": 0.434005080491782, "learning_rate": 3.241239618159979e-06, "loss": 0.0378, "step": 10531 }, { "epoch": 0.8199698311517688, "grad_norm": 0.5605132750260499, "learning_rate": 3.2385157855585936e-06, "loss": 0.0535, "step": 10532 }, { "epoch": 0.8200476862439784, "grad_norm": 0.5868564916274445, "learning_rate": 3.2357929971188894e-06, "loss": 0.0511, "step": 10533 }, { "epoch": 0.820125541336188, "grad_norm": 0.43242817909839787, "learning_rate": 3.2330712530104823e-06, "loss": 0.0424, "step": 10534 }, { "epoch": 0.8202033964283977, "grad_norm": 0.49883857738862053, "learning_rate": 3.2303505534029256e-06, "loss": 0.0379, "step": 10535 }, { "epoch": 0.8202812515206073, "grad_norm": 0.38437610353740825, "learning_rate": 3.2276308984657032e-06, "loss": 0.0377, "step": 10536 }, { "epoch": 0.8203591066128169, "grad_norm": 0.4437410738218866, "learning_rate": 3.224912288368236e-06, "loss": 0.0414, "step": 10537 }, { "epoch": 0.8204369617050266, "grad_norm": 0.6052559196035839, "learning_rate": 3.222194723279886e-06, "loss": 0.0596, "step": 10538 }, { "epoch": 0.8205148167972361, "grad_norm": 0.48741974886180706, "learning_rate": 3.2194782033699434e-06, "loss": 0.0458, "step": 10539 }, { "epoch": 0.8205926718894457, "grad_norm": 0.6340286974830303, "learning_rate": 3.2167627288076342e-06, "loss": 0.0851, "step": 10540 }, { "epoch": 0.8206705269816554, "grad_norm": 0.49632444886191573, "learning_rate": 3.214048299762118e-06, "loss": 0.0458, "step": 10541 }, { "epoch": 0.820748382073865, "grad_norm": 0.4733852020169177, "learning_rate": 3.211334916402491e-06, "loss": 0.049, "step": 10542 }, { "epoch": 0.8208262371660746, "grad_norm": 0.4435311443056352, "learning_rate": 3.2086225788977863e-06, "loss": 0.046, "step": 10543 }, { "epoch": 0.8209040922582843, "grad_norm": 0.44108918486635385, "learning_rate": 3.2059112874169673e-06, "loss": 0.0389, "step": 10544 }, { "epoch": 0.8209819473504939, "grad_norm": 0.5462922763287417, "learning_rate": 3.203201042128936e-06, "loss": 0.0612, "step": 10545 }, { "epoch": 0.8210598024427035, "grad_norm": 0.4308488232212212, "learning_rate": 3.200491843202529e-06, "loss": 0.0484, "step": 10546 }, { "epoch": 0.8211376575349132, "grad_norm": 0.5248939267155323, "learning_rate": 3.1977836908065084e-06, "loss": 0.0514, "step": 10547 }, { "epoch": 0.8212155126271228, "grad_norm": 0.49631355843116975, "learning_rate": 3.195076585109593e-06, "loss": 0.0539, "step": 10548 }, { "epoch": 0.8212933677193324, "grad_norm": 0.5580911888659005, "learning_rate": 3.192370526280417e-06, "loss": 0.0686, "step": 10549 }, { "epoch": 0.8213712228115421, "grad_norm": 0.4780571560782084, "learning_rate": 3.189665514487552e-06, "loss": 0.0605, "step": 10550 }, { "epoch": 0.8213712228115421, "eval_loss": 0.005712499842047691, "eval_runtime": 162.3093, "eval_samples_per_second": 17.744, "eval_steps_per_second": 0.635, "step": 10550 }, { "epoch": 0.8214490779037517, "grad_norm": 0.5575166530221282, "learning_rate": 3.186961549899512e-06, "loss": 0.0621, "step": 10551 }, { "epoch": 0.8215269329959612, "grad_norm": 0.4233550497693429, "learning_rate": 3.184258632684738e-06, "loss": 0.0476, "step": 10552 }, { "epoch": 0.821604788088171, "grad_norm": 0.4353267687394514, "learning_rate": 3.181556763011611e-06, "loss": 0.0449, "step": 10553 }, { "epoch": 0.8216826431803805, "grad_norm": 0.3474650770283356, "learning_rate": 3.1788559410484443e-06, "loss": 0.0261, "step": 10554 }, { "epoch": 0.8217604982725901, "grad_norm": 0.4875562802143535, "learning_rate": 3.1761561669634866e-06, "loss": 0.0462, "step": 10555 }, { "epoch": 0.8218383533647997, "grad_norm": 0.4990394382785982, "learning_rate": 3.1734574409249184e-06, "loss": 0.0498, "step": 10556 }, { "epoch": 0.8219162084570094, "grad_norm": 0.5380692217403459, "learning_rate": 3.1707597631008613e-06, "loss": 0.0681, "step": 10557 }, { "epoch": 0.821994063549219, "grad_norm": 0.4473709363304116, "learning_rate": 3.1680631336593624e-06, "loss": 0.0518, "step": 10558 }, { "epoch": 0.8220719186414286, "grad_norm": 0.4963305164858819, "learning_rate": 3.1653675527684167e-06, "loss": 0.0462, "step": 10559 }, { "epoch": 0.8221497737336383, "grad_norm": 0.3515912723330486, "learning_rate": 3.1626730205959434e-06, "loss": 0.0307, "step": 10560 }, { "epoch": 0.8222276288258479, "grad_norm": 0.47422168097098594, "learning_rate": 3.159979537309803e-06, "loss": 0.053, "step": 10561 }, { "epoch": 0.8223054839180575, "grad_norm": 0.4535819518547944, "learning_rate": 3.1572871030777773e-06, "loss": 0.0418, "step": 10562 }, { "epoch": 0.8223833390102672, "grad_norm": 0.3918081033168262, "learning_rate": 3.154595718067592e-06, "loss": 0.04, "step": 10563 }, { "epoch": 0.8224611941024768, "grad_norm": 0.7053813202543527, "learning_rate": 3.151905382446918e-06, "loss": 0.0691, "step": 10564 }, { "epoch": 0.8225390491946863, "grad_norm": 0.4359151352400052, "learning_rate": 3.1492160963833474e-06, "loss": 0.0404, "step": 10565 }, { "epoch": 0.822616904286896, "grad_norm": 0.409711137737158, "learning_rate": 3.146527860044406e-06, "loss": 0.0377, "step": 10566 }, { "epoch": 0.8226947593791056, "grad_norm": 0.38810371414302686, "learning_rate": 3.1438406735975626e-06, "loss": 0.038, "step": 10567 }, { "epoch": 0.8227726144713152, "grad_norm": 0.4590756221541838, "learning_rate": 3.1411545372102093e-06, "loss": 0.0449, "step": 10568 }, { "epoch": 0.8228504695635249, "grad_norm": 0.561892656942832, "learning_rate": 3.138469451049688e-06, "loss": 0.0735, "step": 10569 }, { "epoch": 0.8229283246557345, "grad_norm": 0.38482679010486626, "learning_rate": 3.13578541528327e-06, "loss": 0.0302, "step": 10570 }, { "epoch": 0.8230061797479441, "grad_norm": 0.3886380450327873, "learning_rate": 3.1331024300781454e-06, "loss": 0.0295, "step": 10571 }, { "epoch": 0.8230840348401538, "grad_norm": 0.46238140569105735, "learning_rate": 3.1304204956014604e-06, "loss": 0.0511, "step": 10572 }, { "epoch": 0.8231618899323634, "grad_norm": 0.46643149358600244, "learning_rate": 3.1277396120202774e-06, "loss": 0.035, "step": 10573 }, { "epoch": 0.823239745024573, "grad_norm": 0.49772475399840505, "learning_rate": 3.1250597795016157e-06, "loss": 0.052, "step": 10574 }, { "epoch": 0.8233176001167827, "grad_norm": 0.4087074119051878, "learning_rate": 3.1223809982124107e-06, "loss": 0.0314, "step": 10575 }, { "epoch": 0.8233954552089923, "grad_norm": 0.29144076620010795, "learning_rate": 3.1197032683195385e-06, "loss": 0.0201, "step": 10576 }, { "epoch": 0.8234733103012019, "grad_norm": 0.43324255675461437, "learning_rate": 3.117026589989807e-06, "loss": 0.0461, "step": 10577 }, { "epoch": 0.8235511653934116, "grad_norm": 0.38612268361356183, "learning_rate": 3.114350963389956e-06, "loss": 0.0313, "step": 10578 }, { "epoch": 0.8236290204856211, "grad_norm": 0.5212612510050595, "learning_rate": 3.1116763886866817e-06, "loss": 0.0672, "step": 10579 }, { "epoch": 0.8237068755778307, "grad_norm": 0.4296285233952611, "learning_rate": 3.109002866046582e-06, "loss": 0.0439, "step": 10580 }, { "epoch": 0.8237847306700404, "grad_norm": 0.5672818946117968, "learning_rate": 3.1063303956362078e-06, "loss": 0.0807, "step": 10581 }, { "epoch": 0.82386258576225, "grad_norm": 0.3938545144517249, "learning_rate": 3.103658977622044e-06, "loss": 0.0319, "step": 10582 }, { "epoch": 0.8239404408544596, "grad_norm": 0.42769107116723454, "learning_rate": 3.100988612170499e-06, "loss": 0.0391, "step": 10583 }, { "epoch": 0.8240182959466693, "grad_norm": 0.464819847083324, "learning_rate": 3.0983192994479384e-06, "loss": 0.0524, "step": 10584 }, { "epoch": 0.8240961510388789, "grad_norm": 0.46122313968112527, "learning_rate": 3.0956510396206395e-06, "loss": 0.0478, "step": 10585 }, { "epoch": 0.8241740061310885, "grad_norm": 0.3715309533253294, "learning_rate": 3.092983832854823e-06, "loss": 0.0334, "step": 10586 }, { "epoch": 0.8242518612232982, "grad_norm": 0.35993710362380354, "learning_rate": 3.0903176793166435e-06, "loss": 0.0261, "step": 10587 }, { "epoch": 0.8243297163155078, "grad_norm": 0.3737047549527431, "learning_rate": 3.0876525791721913e-06, "loss": 0.0294, "step": 10588 }, { "epoch": 0.8244075714077174, "grad_norm": 0.4803143296332108, "learning_rate": 3.0849885325874872e-06, "loss": 0.0384, "step": 10589 }, { "epoch": 0.8244854264999271, "grad_norm": 0.32773960518010037, "learning_rate": 3.08232553972849e-06, "loss": 0.0209, "step": 10590 }, { "epoch": 0.8245632815921367, "grad_norm": 0.40532099311645636, "learning_rate": 3.0796636007610934e-06, "loss": 0.038, "step": 10591 }, { "epoch": 0.8246411366843462, "grad_norm": 0.47699795957481994, "learning_rate": 3.0770027158511205e-06, "loss": 0.044, "step": 10592 }, { "epoch": 0.8247189917765558, "grad_norm": 0.45360243590744065, "learning_rate": 3.074342885164332e-06, "loss": 0.0514, "step": 10593 }, { "epoch": 0.8247968468687655, "grad_norm": 0.3793490941147813, "learning_rate": 3.0716841088664217e-06, "loss": 0.0369, "step": 10594 }, { "epoch": 0.8248747019609751, "grad_norm": 0.5649440962643154, "learning_rate": 3.069026387123024e-06, "loss": 0.0605, "step": 10595 }, { "epoch": 0.8249525570531847, "grad_norm": 0.43985250583384455, "learning_rate": 3.0663697200997024e-06, "loss": 0.0299, "step": 10596 }, { "epoch": 0.8250304121453944, "grad_norm": 0.3894448493156811, "learning_rate": 3.0637141079619504e-06, "loss": 0.0256, "step": 10597 }, { "epoch": 0.825108267237604, "grad_norm": 0.5056431623968112, "learning_rate": 3.0610595508752004e-06, "loss": 0.0495, "step": 10598 }, { "epoch": 0.8251861223298136, "grad_norm": 0.5345040902952233, "learning_rate": 3.058406049004823e-06, "loss": 0.0555, "step": 10599 }, { "epoch": 0.8252639774220233, "grad_norm": 0.4218997421145535, "learning_rate": 3.0557536025161138e-06, "loss": 0.0342, "step": 10600 }, { "epoch": 0.8252639774220233, "eval_loss": 0.005684312898665667, "eval_runtime": 162.1819, "eval_samples_per_second": 17.758, "eval_steps_per_second": 0.635, "step": 10600 }, { "epoch": 0.8253418325142329, "grad_norm": 0.4346374040265075, "learning_rate": 3.0531022115743126e-06, "loss": 0.0394, "step": 10601 }, { "epoch": 0.8254196876064425, "grad_norm": 0.3967330295714579, "learning_rate": 3.0504518763445846e-06, "loss": 0.0311, "step": 10602 }, { "epoch": 0.8254975426986522, "grad_norm": 0.4820629029904678, "learning_rate": 3.0478025969920356e-06, "loss": 0.0509, "step": 10603 }, { "epoch": 0.8255753977908618, "grad_norm": 0.42414085763783316, "learning_rate": 3.045154373681698e-06, "loss": 0.0332, "step": 10604 }, { "epoch": 0.8256532528830713, "grad_norm": 0.4467853485800305, "learning_rate": 3.0425072065785512e-06, "loss": 0.0323, "step": 10605 }, { "epoch": 0.825731107975281, "grad_norm": 0.4759831333557272, "learning_rate": 3.0398610958475004e-06, "loss": 0.0343, "step": 10606 }, { "epoch": 0.8258089630674906, "grad_norm": 0.4879848986096015, "learning_rate": 3.0372160416533813e-06, "loss": 0.0434, "step": 10607 }, { "epoch": 0.8258868181597002, "grad_norm": 0.48709883610253935, "learning_rate": 3.0345720441609726e-06, "loss": 0.0465, "step": 10608 }, { "epoch": 0.8259646732519099, "grad_norm": 0.4285805471385212, "learning_rate": 3.031929103534981e-06, "loss": 0.0312, "step": 10609 }, { "epoch": 0.8260425283441195, "grad_norm": 0.5197400581051239, "learning_rate": 3.0292872199400468e-06, "loss": 0.0429, "step": 10610 }, { "epoch": 0.8261203834363291, "grad_norm": 0.4715401189518388, "learning_rate": 3.026646393540751e-06, "loss": 0.0481, "step": 10611 }, { "epoch": 0.8261982385285388, "grad_norm": 0.3540600208637552, "learning_rate": 3.0240066245016032e-06, "loss": 0.0249, "step": 10612 }, { "epoch": 0.8262760936207484, "grad_norm": 0.5152091131631106, "learning_rate": 3.02136791298705e-06, "loss": 0.0462, "step": 10613 }, { "epoch": 0.826353948712958, "grad_norm": 0.5296319425574028, "learning_rate": 3.018730259161462e-06, "loss": 0.0572, "step": 10614 }, { "epoch": 0.8264318038051677, "grad_norm": 0.3854021154141802, "learning_rate": 3.0160936631891655e-06, "loss": 0.0365, "step": 10615 }, { "epoch": 0.8265096588973773, "grad_norm": 0.2896669918239339, "learning_rate": 3.013458125234403e-06, "loss": 0.0146, "step": 10616 }, { "epoch": 0.8265875139895869, "grad_norm": 0.48742105924716395, "learning_rate": 3.0108236454613536e-06, "loss": 0.0497, "step": 10617 }, { "epoch": 0.8266653690817966, "grad_norm": 0.42118819924939155, "learning_rate": 3.0081902240341375e-06, "loss": 0.042, "step": 10618 }, { "epoch": 0.8267432241740061, "grad_norm": 0.5899833852524274, "learning_rate": 3.005557861116799e-06, "loss": 0.0637, "step": 10619 }, { "epoch": 0.8268210792662157, "grad_norm": 0.26380958932458226, "learning_rate": 3.002926556873327e-06, "loss": 0.0124, "step": 10620 }, { "epoch": 0.8268989343584254, "grad_norm": 0.4037982896375256, "learning_rate": 3.0002963114676365e-06, "loss": 0.0321, "step": 10621 }, { "epoch": 0.826976789450635, "grad_norm": 0.4199404840455864, "learning_rate": 2.9976671250635793e-06, "loss": 0.039, "step": 10622 }, { "epoch": 0.8270546445428446, "grad_norm": 0.5310081005365128, "learning_rate": 2.995038997824944e-06, "loss": 0.0675, "step": 10623 }, { "epoch": 0.8271324996350543, "grad_norm": 0.605160681599392, "learning_rate": 2.9924119299154443e-06, "loss": 0.0737, "step": 10624 }, { "epoch": 0.8272103547272639, "grad_norm": 0.4570420244838279, "learning_rate": 2.98978592149874e-06, "loss": 0.0497, "step": 10625 }, { "epoch": 0.8272882098194735, "grad_norm": 0.4076437056670379, "learning_rate": 2.987160972738421e-06, "loss": 0.0414, "step": 10626 }, { "epoch": 0.8273660649116831, "grad_norm": 0.4717212744924978, "learning_rate": 2.984537083798005e-06, "loss": 0.0512, "step": 10627 }, { "epoch": 0.8274439200038928, "grad_norm": 0.4484511736010597, "learning_rate": 2.9819142548409542e-06, "loss": 0.0466, "step": 10628 }, { "epoch": 0.8275217750961024, "grad_norm": 0.36656742711806506, "learning_rate": 2.9792924860306428e-06, "loss": 0.03, "step": 10629 }, { "epoch": 0.827599630188312, "grad_norm": 0.4372882023617287, "learning_rate": 2.976671777530409e-06, "loss": 0.0405, "step": 10630 }, { "epoch": 0.8276774852805217, "grad_norm": 0.3000879360364885, "learning_rate": 2.9740521295035085e-06, "loss": 0.0191, "step": 10631 }, { "epoch": 0.8277553403727312, "grad_norm": 0.5528597044877983, "learning_rate": 2.9714335421131314e-06, "loss": 0.0537, "step": 10632 }, { "epoch": 0.8278331954649408, "grad_norm": 0.41665305366821087, "learning_rate": 2.968816015522402e-06, "loss": 0.0306, "step": 10633 }, { "epoch": 0.8279110505571505, "grad_norm": 0.41940445173119395, "learning_rate": 2.9661995498943817e-06, "loss": 0.0394, "step": 10634 }, { "epoch": 0.8279889056493601, "grad_norm": 0.5571501792691296, "learning_rate": 2.96358414539206e-06, "loss": 0.0639, "step": 10635 }, { "epoch": 0.8280667607415697, "grad_norm": 0.32497926421417267, "learning_rate": 2.9609698021783708e-06, "loss": 0.0208, "step": 10636 }, { "epoch": 0.8281446158337794, "grad_norm": 0.46343811753041264, "learning_rate": 2.9583565204161767e-06, "loss": 0.0408, "step": 10637 }, { "epoch": 0.828222470925989, "grad_norm": 0.5528707190367482, "learning_rate": 2.955744300268266e-06, "loss": 0.0541, "step": 10638 }, { "epoch": 0.8283003260181986, "grad_norm": 0.39996581088071786, "learning_rate": 2.9531331418973687e-06, "loss": 0.0271, "step": 10639 }, { "epoch": 0.8283781811104083, "grad_norm": 0.44899314156460857, "learning_rate": 2.9505230454661447e-06, "loss": 0.0481, "step": 10640 }, { "epoch": 0.8284560362026179, "grad_norm": 0.48580904596428687, "learning_rate": 2.9479140111372006e-06, "loss": 0.0397, "step": 10641 }, { "epoch": 0.8285338912948275, "grad_norm": 0.3279476580007802, "learning_rate": 2.9453060390730638e-06, "loss": 0.0176, "step": 10642 }, { "epoch": 0.8286117463870372, "grad_norm": 0.5111039434838618, "learning_rate": 2.9426991294361927e-06, "loss": 0.0578, "step": 10643 }, { "epoch": 0.8286896014792468, "grad_norm": 0.503998006875372, "learning_rate": 2.9400932823889914e-06, "loss": 0.0543, "step": 10644 }, { "epoch": 0.8287674565714563, "grad_norm": 0.3990805273885042, "learning_rate": 2.937488498093786e-06, "loss": 0.0398, "step": 10645 }, { "epoch": 0.828845311663666, "grad_norm": 0.5140764888242672, "learning_rate": 2.934884776712854e-06, "loss": 0.0617, "step": 10646 }, { "epoch": 0.8289231667558756, "grad_norm": 0.4583434708763054, "learning_rate": 2.932282118408385e-06, "loss": 0.0403, "step": 10647 }, { "epoch": 0.8290010218480852, "grad_norm": 0.5055364834785879, "learning_rate": 2.929680523342513e-06, "loss": 0.0432, "step": 10648 }, { "epoch": 0.8290788769402949, "grad_norm": 0.5905075452258632, "learning_rate": 2.9270799916773064e-06, "loss": 0.06, "step": 10649 }, { "epoch": 0.8291567320325045, "grad_norm": 0.5163778760812558, "learning_rate": 2.924480523574762e-06, "loss": 0.0573, "step": 10650 }, { "epoch": 0.8291567320325045, "eval_loss": 0.005624874494969845, "eval_runtime": 162.0406, "eval_samples_per_second": 17.773, "eval_steps_per_second": 0.636, "step": 10650 }, { "epoch": 0.8292345871247141, "grad_norm": 0.41957988186257256, "learning_rate": 2.9218821191968237e-06, "loss": 0.0431, "step": 10651 }, { "epoch": 0.8293124422169238, "grad_norm": 0.36391522058054493, "learning_rate": 2.919284778705356e-06, "loss": 0.0303, "step": 10652 }, { "epoch": 0.8293902973091334, "grad_norm": 0.5137309888357091, "learning_rate": 2.916688502262159e-06, "loss": 0.0446, "step": 10653 }, { "epoch": 0.829468152401343, "grad_norm": 0.35163403332636933, "learning_rate": 2.91409329002897e-06, "loss": 0.0268, "step": 10654 }, { "epoch": 0.8295460074935527, "grad_norm": 0.47079945414856444, "learning_rate": 2.9114991421674576e-06, "loss": 0.0441, "step": 10655 }, { "epoch": 0.8296238625857623, "grad_norm": 0.35750887014108884, "learning_rate": 2.908906058839227e-06, "loss": 0.0303, "step": 10656 }, { "epoch": 0.8297017176779719, "grad_norm": 0.4288895224437452, "learning_rate": 2.906314040205813e-06, "loss": 0.0314, "step": 10657 }, { "epoch": 0.8297795727701816, "grad_norm": 0.6267300272601708, "learning_rate": 2.903723086428687e-06, "loss": 0.078, "step": 10658 }, { "epoch": 0.8298574278623911, "grad_norm": 0.5357130755327532, "learning_rate": 2.9011331976692527e-06, "loss": 0.0582, "step": 10659 }, { "epoch": 0.8299352829546007, "grad_norm": 0.33840925565486984, "learning_rate": 2.898544374088843e-06, "loss": 0.0218, "step": 10660 }, { "epoch": 0.8300131380468104, "grad_norm": 0.4348293980088264, "learning_rate": 2.8959566158487407e-06, "loss": 0.0355, "step": 10661 }, { "epoch": 0.83009099313902, "grad_norm": 0.414983967355789, "learning_rate": 2.893369923110145e-06, "loss": 0.0358, "step": 10662 }, { "epoch": 0.8301688482312296, "grad_norm": 0.42773483690621106, "learning_rate": 2.890784296034195e-06, "loss": 0.0352, "step": 10663 }, { "epoch": 0.8302467033234392, "grad_norm": 0.49902817657379084, "learning_rate": 2.8881997347819624e-06, "loss": 0.0513, "step": 10664 }, { "epoch": 0.8303245584156489, "grad_norm": 0.47872397304382486, "learning_rate": 2.885616239514455e-06, "loss": 0.0466, "step": 10665 }, { "epoch": 0.8304024135078585, "grad_norm": 0.5205659527547718, "learning_rate": 2.88303381039261e-06, "loss": 0.0468, "step": 10666 }, { "epoch": 0.8304802686000681, "grad_norm": 0.58145569274393, "learning_rate": 2.8804524475773023e-06, "loss": 0.0531, "step": 10667 }, { "epoch": 0.8305581236922778, "grad_norm": 0.4557853338729948, "learning_rate": 2.8778721512293374e-06, "loss": 0.0529, "step": 10668 }, { "epoch": 0.8306359787844874, "grad_norm": 0.44834653999243956, "learning_rate": 2.875292921509456e-06, "loss": 0.0354, "step": 10669 }, { "epoch": 0.830713833876697, "grad_norm": 0.4160204867377998, "learning_rate": 2.872714758578334e-06, "loss": 0.0303, "step": 10670 }, { "epoch": 0.8307916889689066, "grad_norm": 0.4070697244059394, "learning_rate": 2.870137662596573e-06, "loss": 0.036, "step": 10671 }, { "epoch": 0.8308695440611162, "grad_norm": 0.34132932095204394, "learning_rate": 2.8675616337247205e-06, "loss": 0.02, "step": 10672 }, { "epoch": 0.8309473991533258, "grad_norm": 0.38046233418550285, "learning_rate": 2.864986672123249e-06, "loss": 0.0371, "step": 10673 }, { "epoch": 0.8310252542455355, "grad_norm": 0.3990445499360452, "learning_rate": 2.8624127779525677e-06, "loss": 0.0323, "step": 10674 }, { "epoch": 0.8311031093377451, "grad_norm": 0.5792581545500964, "learning_rate": 2.859839951373016e-06, "loss": 0.0602, "step": 10675 }, { "epoch": 0.8311809644299547, "grad_norm": 0.42032525051104597, "learning_rate": 2.8572681925448687e-06, "loss": 0.0398, "step": 10676 }, { "epoch": 0.8312588195221644, "grad_norm": 0.3728526536370798, "learning_rate": 2.8546975016283363e-06, "loss": 0.0278, "step": 10677 }, { "epoch": 0.831336674614374, "grad_norm": 0.40975984318513897, "learning_rate": 2.852127878783557e-06, "loss": 0.0326, "step": 10678 }, { "epoch": 0.8314145297065836, "grad_norm": 0.6103973431496368, "learning_rate": 2.849559324170612e-06, "loss": 0.07, "step": 10679 }, { "epoch": 0.8314923847987933, "grad_norm": 0.4947101356203644, "learning_rate": 2.8469918379495044e-06, "loss": 0.053, "step": 10680 }, { "epoch": 0.8315702398910029, "grad_norm": 0.41098261524359975, "learning_rate": 2.8444254202801745e-06, "loss": 0.0366, "step": 10681 }, { "epoch": 0.8316480949832125, "grad_norm": 0.5128231788992245, "learning_rate": 2.8418600713225086e-06, "loss": 0.0548, "step": 10682 }, { "epoch": 0.8317259500754222, "grad_norm": 0.3829390641774846, "learning_rate": 2.8392957912363096e-06, "loss": 0.034, "step": 10683 }, { "epoch": 0.8318038051676317, "grad_norm": 0.3349448764984951, "learning_rate": 2.8367325801813204e-06, "loss": 0.0233, "step": 10684 }, { "epoch": 0.8318816602598413, "grad_norm": 0.4503741114112679, "learning_rate": 2.8341704383172164e-06, "loss": 0.0435, "step": 10685 }, { "epoch": 0.831959515352051, "grad_norm": 0.421848707140361, "learning_rate": 2.831609365803607e-06, "loss": 0.0398, "step": 10686 }, { "epoch": 0.8320373704442606, "grad_norm": 0.3461473743810433, "learning_rate": 2.8290493628000383e-06, "loss": 0.0244, "step": 10687 }, { "epoch": 0.8321152255364702, "grad_norm": 0.4078196948208103, "learning_rate": 2.8264904294659802e-06, "loss": 0.0376, "step": 10688 }, { "epoch": 0.8321930806286799, "grad_norm": 0.5519556668021451, "learning_rate": 2.8239325659608475e-06, "loss": 0.0564, "step": 10689 }, { "epoch": 0.8322709357208895, "grad_norm": 0.4286988790814793, "learning_rate": 2.8213757724439816e-06, "loss": 0.0485, "step": 10690 }, { "epoch": 0.8323487908130991, "grad_norm": 0.5356874487800153, "learning_rate": 2.818820049074653e-06, "loss": 0.0573, "step": 10691 }, { "epoch": 0.8324266459053088, "grad_norm": 0.4237167718113505, "learning_rate": 2.816265396012081e-06, "loss": 0.0254, "step": 10692 }, { "epoch": 0.8325045009975184, "grad_norm": 0.4721292974015748, "learning_rate": 2.813711813415405e-06, "loss": 0.0456, "step": 10693 }, { "epoch": 0.832582356089728, "grad_norm": 0.48616138847614154, "learning_rate": 2.8111593014437e-06, "loss": 0.0491, "step": 10694 }, { "epoch": 0.8326602111819377, "grad_norm": 0.42519043525172373, "learning_rate": 2.808607860255981e-06, "loss": 0.0358, "step": 10695 }, { "epoch": 0.8327380662741473, "grad_norm": 0.4171788515414553, "learning_rate": 2.806057490011176e-06, "loss": 0.0361, "step": 10696 }, { "epoch": 0.8328159213663568, "grad_norm": 0.5139551364879551, "learning_rate": 2.803508190868176e-06, "loss": 0.0492, "step": 10697 }, { "epoch": 0.8328937764585665, "grad_norm": 0.4851557467518774, "learning_rate": 2.800959962985783e-06, "loss": 0.0361, "step": 10698 }, { "epoch": 0.8329716315507761, "grad_norm": 0.43648011997125946, "learning_rate": 2.7984128065227433e-06, "loss": 0.0436, "step": 10699 }, { "epoch": 0.8330494866429857, "grad_norm": 0.40170167519792666, "learning_rate": 2.795866721637732e-06, "loss": 0.0355, "step": 10700 }, { "epoch": 0.8330494866429857, "eval_loss": 0.005542097147554159, "eval_runtime": 162.1324, "eval_samples_per_second": 17.763, "eval_steps_per_second": 0.635, "step": 10700 }, { "epoch": 0.8331273417351953, "grad_norm": 0.4389687542204981, "learning_rate": 2.7933217084893517e-06, "loss": 0.039, "step": 10701 }, { "epoch": 0.833205196827405, "grad_norm": 0.39194503492388394, "learning_rate": 2.790777767236155e-06, "loss": 0.0412, "step": 10702 }, { "epoch": 0.8332830519196146, "grad_norm": 0.360156612153526, "learning_rate": 2.7882348980366126e-06, "loss": 0.0334, "step": 10703 }, { "epoch": 0.8333609070118242, "grad_norm": 0.5343592053236599, "learning_rate": 2.7856931010491382e-06, "loss": 0.0466, "step": 10704 }, { "epoch": 0.8334387621040339, "grad_norm": 0.44019622954245263, "learning_rate": 2.7831523764320655e-06, "loss": 0.049, "step": 10705 }, { "epoch": 0.8335166171962435, "grad_norm": 0.3935355523010842, "learning_rate": 2.7806127243436676e-06, "loss": 0.0215, "step": 10706 }, { "epoch": 0.8335944722884531, "grad_norm": 0.613047435113864, "learning_rate": 2.7780741449421644e-06, "loss": 0.0846, "step": 10707 }, { "epoch": 0.8336723273806628, "grad_norm": 0.5016855281707988, "learning_rate": 2.7755366383856897e-06, "loss": 0.0376, "step": 10708 }, { "epoch": 0.8337501824728724, "grad_norm": 0.6147995957943589, "learning_rate": 2.7730002048323237e-06, "loss": 0.0752, "step": 10709 }, { "epoch": 0.833828037565082, "grad_norm": 0.41592505950904074, "learning_rate": 2.7704648444400683e-06, "loss": 0.0357, "step": 10710 }, { "epoch": 0.8339058926572916, "grad_norm": 0.4145045589683364, "learning_rate": 2.767930557366867e-06, "loss": 0.035, "step": 10711 }, { "epoch": 0.8339837477495012, "grad_norm": 0.4756072132482564, "learning_rate": 2.7653973437705904e-06, "loss": 0.0338, "step": 10712 }, { "epoch": 0.8340616028417108, "grad_norm": 0.46904154030179257, "learning_rate": 2.76286520380906e-06, "loss": 0.0438, "step": 10713 }, { "epoch": 0.8341394579339205, "grad_norm": 0.3873354106729824, "learning_rate": 2.7603341376399996e-06, "loss": 0.0329, "step": 10714 }, { "epoch": 0.8342173130261301, "grad_norm": 0.40602575665949203, "learning_rate": 2.757804145421088e-06, "loss": 0.0353, "step": 10715 }, { "epoch": 0.8342951681183397, "grad_norm": 0.5787347009897577, "learning_rate": 2.755275227309935e-06, "loss": 0.0583, "step": 10716 }, { "epoch": 0.8343730232105494, "grad_norm": 0.4266005567071618, "learning_rate": 2.7527473834640715e-06, "loss": 0.0342, "step": 10717 }, { "epoch": 0.834450878302759, "grad_norm": 0.47911262962786183, "learning_rate": 2.7502206140409815e-06, "loss": 0.0384, "step": 10718 }, { "epoch": 0.8345287333949686, "grad_norm": 0.3955387704142593, "learning_rate": 2.7476949191980675e-06, "loss": 0.0302, "step": 10719 }, { "epoch": 0.8346065884871783, "grad_norm": 0.4554777868458236, "learning_rate": 2.7451702990926653e-06, "loss": 0.0413, "step": 10720 }, { "epoch": 0.8346844435793879, "grad_norm": 0.4434590619916112, "learning_rate": 2.74264675388205e-06, "loss": 0.0366, "step": 10721 }, { "epoch": 0.8347622986715975, "grad_norm": 0.5020922759979639, "learning_rate": 2.7401242837234245e-06, "loss": 0.051, "step": 10722 }, { "epoch": 0.8348401537638072, "grad_norm": 0.49338319944243, "learning_rate": 2.737602888773927e-06, "loss": 0.0428, "step": 10723 }, { "epoch": 0.8349180088560167, "grad_norm": 0.48460225855291256, "learning_rate": 2.7350825691906303e-06, "loss": 0.0488, "step": 10724 }, { "epoch": 0.8349958639482263, "grad_norm": 0.4355650443776752, "learning_rate": 2.732563325130535e-06, "loss": 0.0354, "step": 10725 }, { "epoch": 0.835073719040436, "grad_norm": 0.3691464090519872, "learning_rate": 2.7300451567505826e-06, "loss": 0.0335, "step": 10726 }, { "epoch": 0.8351515741326456, "grad_norm": 0.5243827048736088, "learning_rate": 2.7275280642076365e-06, "loss": 0.0532, "step": 10727 }, { "epoch": 0.8352294292248552, "grad_norm": 0.42272479140843683, "learning_rate": 2.7250120476585083e-06, "loss": 0.039, "step": 10728 }, { "epoch": 0.8353072843170649, "grad_norm": 0.4864466838247822, "learning_rate": 2.7224971072599294e-06, "loss": 0.0453, "step": 10729 }, { "epoch": 0.8353851394092745, "grad_norm": 0.3303811400192233, "learning_rate": 2.719983243168569e-06, "loss": 0.0231, "step": 10730 }, { "epoch": 0.8354629945014841, "grad_norm": 0.4172679637688692, "learning_rate": 2.7174704555410316e-06, "loss": 0.03, "step": 10731 }, { "epoch": 0.8355408495936938, "grad_norm": 0.5121782187864365, "learning_rate": 2.7149587445338464e-06, "loss": 0.0555, "step": 10732 }, { "epoch": 0.8356187046859034, "grad_norm": 0.4910755357454618, "learning_rate": 2.712448110303487e-06, "loss": 0.0455, "step": 10733 }, { "epoch": 0.835696559778113, "grad_norm": 0.4598763429469778, "learning_rate": 2.7099385530063503e-06, "loss": 0.0425, "step": 10734 }, { "epoch": 0.8357744148703226, "grad_norm": 0.5561331116881101, "learning_rate": 2.7074300727987736e-06, "loss": 0.0589, "step": 10735 }, { "epoch": 0.8358522699625323, "grad_norm": 0.40306143455814697, "learning_rate": 2.7049226698370177e-06, "loss": 0.0312, "step": 10736 }, { "epoch": 0.8359301250547418, "grad_norm": 0.39968376009857404, "learning_rate": 2.702416344277283e-06, "loss": 0.0373, "step": 10737 }, { "epoch": 0.8360079801469514, "grad_norm": 0.45634503663452364, "learning_rate": 2.699911096275709e-06, "loss": 0.0415, "step": 10738 }, { "epoch": 0.8360858352391611, "grad_norm": 0.43505108841945006, "learning_rate": 2.697406925988355e-06, "loss": 0.0439, "step": 10739 }, { "epoch": 0.8361636903313707, "grad_norm": 0.663955774962703, "learning_rate": 2.694903833571221e-06, "loss": 0.0784, "step": 10740 }, { "epoch": 0.8362415454235803, "grad_norm": 0.42930100025762735, "learning_rate": 2.692401819180237e-06, "loss": 0.0451, "step": 10741 }, { "epoch": 0.83631940051579, "grad_norm": 0.5137292854075304, "learning_rate": 2.689900882971266e-06, "loss": 0.0514, "step": 10742 }, { "epoch": 0.8363972556079996, "grad_norm": 0.5702336878950949, "learning_rate": 2.687401025100107e-06, "loss": 0.0655, "step": 10743 }, { "epoch": 0.8364751107002092, "grad_norm": 0.3653230819680296, "learning_rate": 2.684902245722485e-06, "loss": 0.0358, "step": 10744 }, { "epoch": 0.8365529657924189, "grad_norm": 0.6000665071255956, "learning_rate": 2.682404544994066e-06, "loss": 0.044, "step": 10745 }, { "epoch": 0.8366308208846285, "grad_norm": 0.3809223866852579, "learning_rate": 2.679907923070446e-06, "loss": 0.0318, "step": 10746 }, { "epoch": 0.8367086759768381, "grad_norm": 0.35598244975381843, "learning_rate": 2.6774123801071492e-06, "loss": 0.0301, "step": 10747 }, { "epoch": 0.8367865310690478, "grad_norm": 0.5194506962914243, "learning_rate": 2.674917916259632e-06, "loss": 0.0444, "step": 10748 }, { "epoch": 0.8368643861612574, "grad_norm": 0.4743805570617809, "learning_rate": 2.6724245316832995e-06, "loss": 0.0455, "step": 10749 }, { "epoch": 0.836942241253467, "grad_norm": 0.38113470432640784, "learning_rate": 2.6699322265334714e-06, "loss": 0.0315, "step": 10750 }, { "epoch": 0.836942241253467, "eval_loss": 0.005524852778762579, "eval_runtime": 162.3297, "eval_samples_per_second": 17.742, "eval_steps_per_second": 0.635, "step": 10750 }, { "epoch": 0.8370200963456766, "grad_norm": 0.5397761385472793, "learning_rate": 2.6674410009654072e-06, "loss": 0.0588, "step": 10751 }, { "epoch": 0.8370979514378862, "grad_norm": 0.4580036940824464, "learning_rate": 2.664950855134303e-06, "loss": 0.0448, "step": 10752 }, { "epoch": 0.8371758065300958, "grad_norm": 0.5231886150291095, "learning_rate": 2.6624617891952696e-06, "loss": 0.0578, "step": 10753 }, { "epoch": 0.8372536616223055, "grad_norm": 0.4923526364892284, "learning_rate": 2.6599738033033796e-06, "loss": 0.0531, "step": 10754 }, { "epoch": 0.8373315167145151, "grad_norm": 0.62706716565989, "learning_rate": 2.6574868976136125e-06, "loss": 0.0675, "step": 10755 }, { "epoch": 0.8374093718067247, "grad_norm": 0.4148280678955825, "learning_rate": 2.6550010722808982e-06, "loss": 0.0276, "step": 10756 }, { "epoch": 0.8374872268989344, "grad_norm": 0.4303115382213679, "learning_rate": 2.6525163274600863e-06, "loss": 0.0436, "step": 10757 }, { "epoch": 0.837565081991144, "grad_norm": 0.45933950930361384, "learning_rate": 2.6500326633059636e-06, "loss": 0.0384, "step": 10758 }, { "epoch": 0.8376429370833536, "grad_norm": 0.39041391464380354, "learning_rate": 2.647550079973256e-06, "loss": 0.0372, "step": 10759 }, { "epoch": 0.8377207921755633, "grad_norm": 0.5025194798102087, "learning_rate": 2.6450685776166164e-06, "loss": 0.0521, "step": 10760 }, { "epoch": 0.8377986472677729, "grad_norm": 0.5451419568877122, "learning_rate": 2.642588156390633e-06, "loss": 0.0629, "step": 10761 }, { "epoch": 0.8378765023599825, "grad_norm": 0.3802491795943505, "learning_rate": 2.640108816449818e-06, "loss": 0.0277, "step": 10762 }, { "epoch": 0.8379543574521922, "grad_norm": 0.5212106082800761, "learning_rate": 2.637630557948618e-06, "loss": 0.0557, "step": 10763 }, { "epoch": 0.8380322125444017, "grad_norm": 0.4002135197043917, "learning_rate": 2.6351533810414288e-06, "loss": 0.0421, "step": 10764 }, { "epoch": 0.8381100676366113, "grad_norm": 0.4713689233036175, "learning_rate": 2.6326772858825633e-06, "loss": 0.0439, "step": 10765 }, { "epoch": 0.838187922728821, "grad_norm": 0.48702260444518675, "learning_rate": 2.630202272626268e-06, "loss": 0.0512, "step": 10766 }, { "epoch": 0.8382657778210306, "grad_norm": 0.47177532468895533, "learning_rate": 2.627728341426727e-06, "loss": 0.0448, "step": 10767 }, { "epoch": 0.8383436329132402, "grad_norm": 0.5460253400837043, "learning_rate": 2.625255492438048e-06, "loss": 0.0644, "step": 10768 }, { "epoch": 0.8384214880054499, "grad_norm": 0.3736037672106624, "learning_rate": 2.6227837258142886e-06, "loss": 0.0338, "step": 10769 }, { "epoch": 0.8384993430976595, "grad_norm": 0.4295928275509528, "learning_rate": 2.620313041709426e-06, "loss": 0.0417, "step": 10770 }, { "epoch": 0.8385771981898691, "grad_norm": 0.49438076146201504, "learning_rate": 2.617843440277366e-06, "loss": 0.038, "step": 10771 }, { "epoch": 0.8386550532820787, "grad_norm": 0.43753783405414576, "learning_rate": 2.615374921671956e-06, "loss": 0.0475, "step": 10772 }, { "epoch": 0.8387329083742884, "grad_norm": 0.45997633916883757, "learning_rate": 2.6129074860469674e-06, "loss": 0.0455, "step": 10773 }, { "epoch": 0.838810763466498, "grad_norm": 0.485386367282827, "learning_rate": 2.610441133556123e-06, "loss": 0.0559, "step": 10774 }, { "epoch": 0.8388886185587076, "grad_norm": 0.40955716133185965, "learning_rate": 2.607975864353056e-06, "loss": 0.0439, "step": 10775 }, { "epoch": 0.8389664736509173, "grad_norm": 0.4934245226724282, "learning_rate": 2.605511678591344e-06, "loss": 0.037, "step": 10776 }, { "epoch": 0.8390443287431268, "grad_norm": 0.5194073764641388, "learning_rate": 2.6030485764244918e-06, "loss": 0.0511, "step": 10777 }, { "epoch": 0.8391221838353364, "grad_norm": 0.43732494356273865, "learning_rate": 2.6005865580059376e-06, "loss": 0.0352, "step": 10778 }, { "epoch": 0.8392000389275461, "grad_norm": 0.5340494810370721, "learning_rate": 2.598125623489063e-06, "loss": 0.0442, "step": 10779 }, { "epoch": 0.8392778940197557, "grad_norm": 0.5498433368996204, "learning_rate": 2.595665773027163e-06, "loss": 0.0677, "step": 10780 }, { "epoch": 0.8393557491119653, "grad_norm": 0.5340004936354243, "learning_rate": 2.5932070067734794e-06, "loss": 0.063, "step": 10781 }, { "epoch": 0.839433604204175, "grad_norm": 0.5143080873145243, "learning_rate": 2.5907493248811767e-06, "loss": 0.0535, "step": 10782 }, { "epoch": 0.8395114592963846, "grad_norm": 0.44944603871477046, "learning_rate": 2.588292727503359e-06, "loss": 0.039, "step": 10783 }, { "epoch": 0.8395893143885942, "grad_norm": 0.4029388436538433, "learning_rate": 2.5858372147930655e-06, "loss": 0.0301, "step": 10784 }, { "epoch": 0.8396671694808039, "grad_norm": 0.42309496639196786, "learning_rate": 2.583382786903259e-06, "loss": 0.0425, "step": 10785 }, { "epoch": 0.8397450245730135, "grad_norm": 0.43902972799874906, "learning_rate": 2.5809294439868394e-06, "loss": 0.0413, "step": 10786 }, { "epoch": 0.8398228796652231, "grad_norm": 0.3793837212926977, "learning_rate": 2.5784771861966395e-06, "loss": 0.0263, "step": 10787 }, { "epoch": 0.8399007347574328, "grad_norm": 0.5015184454871242, "learning_rate": 2.5760260136854222e-06, "loss": 0.0436, "step": 10788 }, { "epoch": 0.8399785898496424, "grad_norm": 0.4460141722683843, "learning_rate": 2.5735759266058847e-06, "loss": 0.0413, "step": 10789 }, { "epoch": 0.8400564449418519, "grad_norm": 0.46190832300719636, "learning_rate": 2.5711269251106543e-06, "loss": 0.0523, "step": 10790 }, { "epoch": 0.8401343000340616, "grad_norm": 0.45462964803373, "learning_rate": 2.568679009352295e-06, "loss": 0.043, "step": 10791 }, { "epoch": 0.8402121551262712, "grad_norm": 0.5359096965957606, "learning_rate": 2.566232179483299e-06, "loss": 0.0512, "step": 10792 }, { "epoch": 0.8402900102184808, "grad_norm": 0.43497478754866664, "learning_rate": 2.5637864356560905e-06, "loss": 0.039, "step": 10793 }, { "epoch": 0.8403678653106905, "grad_norm": 0.499704668975231, "learning_rate": 2.5613417780230275e-06, "loss": 0.0492, "step": 10794 }, { "epoch": 0.8404457204029001, "grad_norm": 0.4845228178973749, "learning_rate": 2.5588982067364043e-06, "loss": 0.0586, "step": 10795 }, { "epoch": 0.8405235754951097, "grad_norm": 0.38554035947783916, "learning_rate": 2.5564557219484454e-06, "loss": 0.0315, "step": 10796 }, { "epoch": 0.8406014305873194, "grad_norm": 0.6231982310412999, "learning_rate": 2.5540143238113026e-06, "loss": 0.061, "step": 10797 }, { "epoch": 0.840679285679529, "grad_norm": 0.5539742428956311, "learning_rate": 2.5515740124770625e-06, "loss": 0.0616, "step": 10798 }, { "epoch": 0.8407571407717386, "grad_norm": 0.40876424513477677, "learning_rate": 2.549134788097747e-06, "loss": 0.0352, "step": 10799 }, { "epoch": 0.8408349958639483, "grad_norm": 0.5637650162784886, "learning_rate": 2.5466966508253087e-06, "loss": 0.0653, "step": 10800 }, { "epoch": 0.8408349958639483, "eval_loss": 0.005489041097462177, "eval_runtime": 162.243, "eval_samples_per_second": 17.751, "eval_steps_per_second": 0.635, "step": 10800 }, { "epoch": 0.8409128509561579, "grad_norm": 0.4657099982773829, "learning_rate": 2.5442596008116315e-06, "loss": 0.0484, "step": 10801 }, { "epoch": 0.8409907060483675, "grad_norm": 0.6530152771663874, "learning_rate": 2.5418236382085314e-06, "loss": 0.0759, "step": 10802 }, { "epoch": 0.8410685611405772, "grad_norm": 0.4719150110653249, "learning_rate": 2.539388763167756e-06, "loss": 0.0381, "step": 10803 }, { "epoch": 0.8411464162327867, "grad_norm": 0.518107853391687, "learning_rate": 2.5369549758409863e-06, "loss": 0.0488, "step": 10804 }, { "epoch": 0.8412242713249963, "grad_norm": 0.5274528558204015, "learning_rate": 2.5345222763798426e-06, "loss": 0.0629, "step": 10805 }, { "epoch": 0.8413021264172059, "grad_norm": 0.40056022799121216, "learning_rate": 2.5320906649358667e-06, "loss": 0.0367, "step": 10806 }, { "epoch": 0.8413799815094156, "grad_norm": 0.446952243581011, "learning_rate": 2.5296601416605348e-06, "loss": 0.0459, "step": 10807 }, { "epoch": 0.8414578366016252, "grad_norm": 0.42645145859124783, "learning_rate": 2.527230706705257e-06, "loss": 0.0427, "step": 10808 }, { "epoch": 0.8415356916938348, "grad_norm": 0.3996556456104276, "learning_rate": 2.524802360221379e-06, "loss": 0.0394, "step": 10809 }, { "epoch": 0.8416135467860445, "grad_norm": 0.41870901392394055, "learning_rate": 2.5223751023601727e-06, "loss": 0.0395, "step": 10810 }, { "epoch": 0.8416914018782541, "grad_norm": 0.43587988862955485, "learning_rate": 2.5199489332728443e-06, "loss": 0.0275, "step": 10811 }, { "epoch": 0.8417692569704637, "grad_norm": 0.46233043763560083, "learning_rate": 2.517523853110535e-06, "loss": 0.0485, "step": 10812 }, { "epoch": 0.8418471120626734, "grad_norm": 0.4122287723497772, "learning_rate": 2.515099862024315e-06, "loss": 0.0347, "step": 10813 }, { "epoch": 0.841924967154883, "grad_norm": 0.5210312779844863, "learning_rate": 2.5126769601651837e-06, "loss": 0.0562, "step": 10814 }, { "epoch": 0.8420028222470926, "grad_norm": 0.5403584195317221, "learning_rate": 2.510255147684082e-06, "loss": 0.0649, "step": 10815 }, { "epoch": 0.8420806773393023, "grad_norm": 0.36653055862491996, "learning_rate": 2.5078344247318766e-06, "loss": 0.0305, "step": 10816 }, { "epoch": 0.8421585324315118, "grad_norm": 0.3115062173101346, "learning_rate": 2.5054147914593665e-06, "loss": 0.0183, "step": 10817 }, { "epoch": 0.8422363875237214, "grad_norm": 0.3794250328569936, "learning_rate": 2.502996248017282e-06, "loss": 0.0387, "step": 10818 }, { "epoch": 0.8423142426159311, "grad_norm": 0.38871443093533137, "learning_rate": 2.50057879455629e-06, "loss": 0.0247, "step": 10819 }, { "epoch": 0.8423920977081407, "grad_norm": 0.356921035222141, "learning_rate": 2.4981624312269826e-06, "loss": 0.0226, "step": 10820 }, { "epoch": 0.8424699528003503, "grad_norm": 0.33647131583887574, "learning_rate": 2.495747158179891e-06, "loss": 0.0282, "step": 10821 }, { "epoch": 0.84254780789256, "grad_norm": 0.4469873373698844, "learning_rate": 2.4933329755654743e-06, "loss": 0.0413, "step": 10822 }, { "epoch": 0.8426256629847696, "grad_norm": 0.3861772570247501, "learning_rate": 2.490919883534124e-06, "loss": 0.0322, "step": 10823 }, { "epoch": 0.8427035180769792, "grad_norm": 0.4539335358402261, "learning_rate": 2.4885078822361663e-06, "loss": 0.0437, "step": 10824 }, { "epoch": 0.8427813731691889, "grad_norm": 0.51298933444705, "learning_rate": 2.4860969718218497e-06, "loss": 0.046, "step": 10825 }, { "epoch": 0.8428592282613985, "grad_norm": 0.4639376286466309, "learning_rate": 2.483687152441374e-06, "loss": 0.042, "step": 10826 }, { "epoch": 0.8429370833536081, "grad_norm": 0.4012567280541136, "learning_rate": 2.4812784242448573e-06, "loss": 0.039, "step": 10827 }, { "epoch": 0.8430149384458178, "grad_norm": 0.49722872240422344, "learning_rate": 2.4788707873823504e-06, "loss": 0.0465, "step": 10828 }, { "epoch": 0.8430927935380274, "grad_norm": 0.45706048678561806, "learning_rate": 2.4764642420038354e-06, "loss": 0.0454, "step": 10829 }, { "epoch": 0.8431706486302369, "grad_norm": 0.45835787604846, "learning_rate": 2.4740587882592236e-06, "loss": 0.0388, "step": 10830 }, { "epoch": 0.8432485037224466, "grad_norm": 0.4124552328444452, "learning_rate": 2.4716544262983755e-06, "loss": 0.0397, "step": 10831 }, { "epoch": 0.8433263588146562, "grad_norm": 0.4461783109469881, "learning_rate": 2.4692511562710663e-06, "loss": 0.05, "step": 10832 }, { "epoch": 0.8434042139068658, "grad_norm": 0.37736718219251014, "learning_rate": 2.466848978327008e-06, "loss": 0.0361, "step": 10833 }, { "epoch": 0.8434820689990755, "grad_norm": 0.4894819580897512, "learning_rate": 2.4644478926158444e-06, "loss": 0.0533, "step": 10834 }, { "epoch": 0.8435599240912851, "grad_norm": 0.4862531750038596, "learning_rate": 2.4620478992871498e-06, "loss": 0.0492, "step": 10835 }, { "epoch": 0.8436377791834947, "grad_norm": 0.46930577351827185, "learning_rate": 2.4596489984904403e-06, "loss": 0.0368, "step": 10836 }, { "epoch": 0.8437156342757044, "grad_norm": 0.43240113489181836, "learning_rate": 2.457251190375154e-06, "loss": 0.0299, "step": 10837 }, { "epoch": 0.843793489367914, "grad_norm": 0.3126819544442137, "learning_rate": 2.4548544750906555e-06, "loss": 0.0167, "step": 10838 }, { "epoch": 0.8438713444601236, "grad_norm": 0.41794972324584406, "learning_rate": 2.4524588527862546e-06, "loss": 0.0318, "step": 10839 }, { "epoch": 0.8439491995523333, "grad_norm": 0.3870915160517912, "learning_rate": 2.4500643236111833e-06, "loss": 0.0321, "step": 10840 }, { "epoch": 0.8440270546445429, "grad_norm": 0.5289647540832918, "learning_rate": 2.447670887714615e-06, "loss": 0.0522, "step": 10841 }, { "epoch": 0.8441049097367525, "grad_norm": 0.5171324731373814, "learning_rate": 2.4452785452456486e-06, "loss": 0.0465, "step": 10842 }, { "epoch": 0.844182764828962, "grad_norm": 0.38195915103337724, "learning_rate": 2.4428872963533136e-06, "loss": 0.027, "step": 10843 }, { "epoch": 0.8442606199211717, "grad_norm": 0.44388102375137334, "learning_rate": 2.4404971411865754e-06, "loss": 0.0503, "step": 10844 }, { "epoch": 0.8443384750133813, "grad_norm": 0.43309456307242594, "learning_rate": 2.4381080798943234e-06, "loss": 0.0347, "step": 10845 }, { "epoch": 0.8444163301055909, "grad_norm": 0.575373598205034, "learning_rate": 2.435720112625397e-06, "loss": 0.0621, "step": 10846 }, { "epoch": 0.8444941851978006, "grad_norm": 0.3157795273565401, "learning_rate": 2.4333332395285456e-06, "loss": 0.0223, "step": 10847 }, { "epoch": 0.8445720402900102, "grad_norm": 0.4757114295683244, "learning_rate": 2.430947460752462e-06, "loss": 0.0423, "step": 10848 }, { "epoch": 0.8446498953822198, "grad_norm": 0.4452361459109785, "learning_rate": 2.428562776445771e-06, "loss": 0.0313, "step": 10849 }, { "epoch": 0.8447277504744295, "grad_norm": 0.4793527258170137, "learning_rate": 2.4261791867570207e-06, "loss": 0.0499, "step": 10850 }, { "epoch": 0.8447277504744295, "eval_loss": 0.00544706778600812, "eval_runtime": 162.2183, "eval_samples_per_second": 17.754, "eval_steps_per_second": 0.635, "step": 10850 }, { "epoch": 0.8448056055666391, "grad_norm": 0.3534939917513903, "learning_rate": 2.4237966918347056e-06, "loss": 0.0251, "step": 10851 }, { "epoch": 0.8448834606588487, "grad_norm": 0.48593737688009914, "learning_rate": 2.4214152918272428e-06, "loss": 0.0421, "step": 10852 }, { "epoch": 0.8449613157510584, "grad_norm": 0.5457323009549179, "learning_rate": 2.419034986882982e-06, "loss": 0.0689, "step": 10853 }, { "epoch": 0.845039170843268, "grad_norm": 0.4568695157091789, "learning_rate": 2.416655777150201e-06, "loss": 0.0401, "step": 10854 }, { "epoch": 0.8451170259354776, "grad_norm": 0.45668795986811533, "learning_rate": 2.414277662777118e-06, "loss": 0.0341, "step": 10855 }, { "epoch": 0.8451948810276873, "grad_norm": 0.5430300788892807, "learning_rate": 2.4119006439118755e-06, "loss": 0.0663, "step": 10856 }, { "epoch": 0.8452727361198968, "grad_norm": 0.5446733529579603, "learning_rate": 2.409524720702552e-06, "loss": 0.0504, "step": 10857 }, { "epoch": 0.8453505912121064, "grad_norm": 0.4168022217193882, "learning_rate": 2.407149893297156e-06, "loss": 0.0282, "step": 10858 }, { "epoch": 0.8454284463043161, "grad_norm": 0.5103031733647188, "learning_rate": 2.404776161843627e-06, "loss": 0.0518, "step": 10859 }, { "epoch": 0.8455063013965257, "grad_norm": 0.35425144319206625, "learning_rate": 2.4024035264898336e-06, "loss": 0.0225, "step": 10860 }, { "epoch": 0.8455841564887353, "grad_norm": 0.4231295163071963, "learning_rate": 2.4000319873835907e-06, "loss": 0.0403, "step": 10861 }, { "epoch": 0.845662011580945, "grad_norm": 0.42598098905578546, "learning_rate": 2.397661544672627e-06, "loss": 0.0279, "step": 10862 }, { "epoch": 0.8457398666731546, "grad_norm": 0.5377090424296543, "learning_rate": 2.3952921985046106e-06, "loss": 0.0453, "step": 10863 }, { "epoch": 0.8458177217653642, "grad_norm": 0.45810494815907143, "learning_rate": 2.3929239490271393e-06, "loss": 0.0441, "step": 10864 }, { "epoch": 0.8458955768575739, "grad_norm": 0.739022914327128, "learning_rate": 2.3905567963877484e-06, "loss": 0.0367, "step": 10865 }, { "epoch": 0.8459734319497835, "grad_norm": 0.509377296297267, "learning_rate": 2.3881907407338957e-06, "loss": 0.0479, "step": 10866 }, { "epoch": 0.8460512870419931, "grad_norm": 0.4143299044139937, "learning_rate": 2.385825782212976e-06, "loss": 0.0319, "step": 10867 }, { "epoch": 0.8461291421342028, "grad_norm": 0.42705633877019283, "learning_rate": 2.3834619209723185e-06, "loss": 0.0344, "step": 10868 }, { "epoch": 0.8462069972264123, "grad_norm": 0.4401315674714666, "learning_rate": 2.3810991571591767e-06, "loss": 0.0435, "step": 10869 }, { "epoch": 0.8462848523186219, "grad_norm": 0.5598699869951559, "learning_rate": 2.3787374909207418e-06, "loss": 0.0541, "step": 10870 }, { "epoch": 0.8463627074108316, "grad_norm": 0.44418027026948603, "learning_rate": 2.376376922404129e-06, "loss": 0.0342, "step": 10871 }, { "epoch": 0.8464405625030412, "grad_norm": 0.5709429275259993, "learning_rate": 2.3740174517564008e-06, "loss": 0.052, "step": 10872 }, { "epoch": 0.8465184175952508, "grad_norm": 0.5590623195985113, "learning_rate": 2.3716590791245354e-06, "loss": 0.041, "step": 10873 }, { "epoch": 0.8465962726874605, "grad_norm": 0.4937829306498326, "learning_rate": 2.369301804655448e-06, "loss": 0.0471, "step": 10874 }, { "epoch": 0.8466741277796701, "grad_norm": 0.47566033575939093, "learning_rate": 2.366945628495989e-06, "loss": 0.0447, "step": 10875 }, { "epoch": 0.8467519828718797, "grad_norm": 0.4499002704559234, "learning_rate": 2.3645905507929334e-06, "loss": 0.0461, "step": 10876 }, { "epoch": 0.8468298379640894, "grad_norm": 0.3949543607026523, "learning_rate": 2.362236571692993e-06, "loss": 0.0371, "step": 10877 }, { "epoch": 0.846907693056299, "grad_norm": 0.4417269487242611, "learning_rate": 2.359883691342808e-06, "loss": 0.0373, "step": 10878 }, { "epoch": 0.8469855481485086, "grad_norm": 0.5855458843227453, "learning_rate": 2.357531909888955e-06, "loss": 0.0706, "step": 10879 }, { "epoch": 0.8470634032407182, "grad_norm": 0.7023931089356653, "learning_rate": 2.3551812274779363e-06, "loss": 0.0661, "step": 10880 }, { "epoch": 0.8471412583329279, "grad_norm": 0.3596690533132591, "learning_rate": 2.352831644256184e-06, "loss": 0.0283, "step": 10881 }, { "epoch": 0.8472191134251374, "grad_norm": 0.49718298455292304, "learning_rate": 2.350483160370076e-06, "loss": 0.0367, "step": 10882 }, { "epoch": 0.847296968517347, "grad_norm": 0.4248345019977233, "learning_rate": 2.3481357759659075e-06, "loss": 0.0415, "step": 10883 }, { "epoch": 0.8473748236095567, "grad_norm": 0.2760851774800927, "learning_rate": 2.345789491189909e-06, "loss": 0.0158, "step": 10884 }, { "epoch": 0.8474526787017663, "grad_norm": 0.4577802553137717, "learning_rate": 2.3434443061882428e-06, "loss": 0.0318, "step": 10885 }, { "epoch": 0.8475305337939759, "grad_norm": 0.37977028647500977, "learning_rate": 2.3411002211070046e-06, "loss": 0.0341, "step": 10886 }, { "epoch": 0.8476083888861856, "grad_norm": 0.6143030452140151, "learning_rate": 2.338757236092217e-06, "loss": 0.0693, "step": 10887 }, { "epoch": 0.8476862439783952, "grad_norm": 0.44457256241237675, "learning_rate": 2.336415351289838e-06, "loss": 0.044, "step": 10888 }, { "epoch": 0.8477640990706048, "grad_norm": 0.44584670745715194, "learning_rate": 2.3340745668457566e-06, "loss": 0.0465, "step": 10889 }, { "epoch": 0.8478419541628145, "grad_norm": 0.39928038800621934, "learning_rate": 2.331734882905794e-06, "loss": 0.0465, "step": 10890 }, { "epoch": 0.8479198092550241, "grad_norm": 0.4471199677014888, "learning_rate": 2.3293962996156938e-06, "loss": 0.0463, "step": 10891 }, { "epoch": 0.8479976643472337, "grad_norm": 0.4574236180954197, "learning_rate": 2.3270588171211504e-06, "loss": 0.0365, "step": 10892 }, { "epoch": 0.8480755194394434, "grad_norm": 0.633124337602939, "learning_rate": 2.3247224355677722e-06, "loss": 0.0804, "step": 10893 }, { "epoch": 0.848153374531653, "grad_norm": 0.4184661184138638, "learning_rate": 2.3223871551011045e-06, "loss": 0.0264, "step": 10894 }, { "epoch": 0.8482312296238625, "grad_norm": 0.5089657929325889, "learning_rate": 2.3200529758666314e-06, "loss": 0.0537, "step": 10895 }, { "epoch": 0.8483090847160722, "grad_norm": 0.43789977085292625, "learning_rate": 2.317719898009745e-06, "loss": 0.0337, "step": 10896 }, { "epoch": 0.8483869398082818, "grad_norm": 0.4082029864987646, "learning_rate": 2.315387921675798e-06, "loss": 0.0355, "step": 10897 }, { "epoch": 0.8484647949004914, "grad_norm": 0.4621437655701374, "learning_rate": 2.313057047010059e-06, "loss": 0.0352, "step": 10898 }, { "epoch": 0.8485426499927011, "grad_norm": 0.5329873681376316, "learning_rate": 2.3107272741577313e-06, "loss": 0.0574, "step": 10899 }, { "epoch": 0.8486205050849107, "grad_norm": 0.3939887012905058, "learning_rate": 2.308398603263946e-06, "loss": 0.0378, "step": 10900 }, { "epoch": 0.8486205050849107, "eval_loss": 0.005439699627459049, "eval_runtime": 163.3373, "eval_samples_per_second": 17.632, "eval_steps_per_second": 0.631, "step": 10900 }, { "epoch": 0.8486983601771203, "grad_norm": 0.40326649508118706, "learning_rate": 2.3060710344737714e-06, "loss": 0.0352, "step": 10901 }, { "epoch": 0.84877621526933, "grad_norm": 0.4032116660405265, "learning_rate": 2.303744567932198e-06, "loss": 0.0278, "step": 10902 }, { "epoch": 0.8488540703615396, "grad_norm": 0.5273305608515402, "learning_rate": 2.301419203784161e-06, "loss": 0.0481, "step": 10903 }, { "epoch": 0.8489319254537492, "grad_norm": 0.46639513209155414, "learning_rate": 2.2990949421745225e-06, "loss": 0.0419, "step": 10904 }, { "epoch": 0.8490097805459589, "grad_norm": 0.4123736813416601, "learning_rate": 2.2967717832480616e-06, "loss": 0.0322, "step": 10905 }, { "epoch": 0.8490876356381685, "grad_norm": 0.5069923833131905, "learning_rate": 2.294449727149508e-06, "loss": 0.0532, "step": 10906 }, { "epoch": 0.8491654907303781, "grad_norm": 0.4656836482880704, "learning_rate": 2.2921287740235077e-06, "loss": 0.0332, "step": 10907 }, { "epoch": 0.8492433458225878, "grad_norm": 0.31372421412029633, "learning_rate": 2.289808924014656e-06, "loss": 0.0222, "step": 10908 }, { "epoch": 0.8493212009147973, "grad_norm": 0.4202119083632515, "learning_rate": 2.2874901772674616e-06, "loss": 0.0442, "step": 10909 }, { "epoch": 0.8493990560070069, "grad_norm": 0.495519675575891, "learning_rate": 2.285172533926374e-06, "loss": 0.0509, "step": 10910 }, { "epoch": 0.8494769110992166, "grad_norm": 0.4875250496518311, "learning_rate": 2.282855994135769e-06, "loss": 0.0496, "step": 10911 }, { "epoch": 0.8495547661914262, "grad_norm": 0.37762325987839485, "learning_rate": 2.280540558039954e-06, "loss": 0.0281, "step": 10912 }, { "epoch": 0.8496326212836358, "grad_norm": 0.4692487244766567, "learning_rate": 2.278226225783182e-06, "loss": 0.0336, "step": 10913 }, { "epoch": 0.8497104763758454, "grad_norm": 0.3168080608237442, "learning_rate": 2.275912997509613e-06, "loss": 0.0241, "step": 10914 }, { "epoch": 0.8497883314680551, "grad_norm": 0.46467952670834967, "learning_rate": 2.273600873363353e-06, "loss": 0.044, "step": 10915 }, { "epoch": 0.8498661865602647, "grad_norm": 0.3824385797978647, "learning_rate": 2.2712898534884366e-06, "loss": 0.0346, "step": 10916 }, { "epoch": 0.8499440416524743, "grad_norm": 0.34785202413164773, "learning_rate": 2.268979938028828e-06, "loss": 0.0195, "step": 10917 }, { "epoch": 0.850021896744684, "grad_norm": 0.38429438864731613, "learning_rate": 2.2666711271284282e-06, "loss": 0.031, "step": 10918 }, { "epoch": 0.8500997518368936, "grad_norm": 0.4785499474885822, "learning_rate": 2.264363420931064e-06, "loss": 0.0447, "step": 10919 }, { "epoch": 0.8501776069291032, "grad_norm": 0.3846308616257138, "learning_rate": 2.2620568195804938e-06, "loss": 0.0295, "step": 10920 }, { "epoch": 0.8502554620213129, "grad_norm": 0.43882059933630857, "learning_rate": 2.259751323220407e-06, "loss": 0.0408, "step": 10921 }, { "epoch": 0.8503333171135224, "grad_norm": 0.42793438414643586, "learning_rate": 2.257446931994427e-06, "loss": 0.0398, "step": 10922 }, { "epoch": 0.850411172205732, "grad_norm": 0.5428716200676084, "learning_rate": 2.2551436460461073e-06, "loss": 0.0603, "step": 10923 }, { "epoch": 0.8504890272979417, "grad_norm": 0.45766643706164406, "learning_rate": 2.252841465518929e-06, "loss": 0.0374, "step": 10924 }, { "epoch": 0.8505668823901513, "grad_norm": 0.5194006783296594, "learning_rate": 2.2505403905563084e-06, "loss": 0.0622, "step": 10925 }, { "epoch": 0.8506447374823609, "grad_norm": 0.4021491992598379, "learning_rate": 2.2482404213015906e-06, "loss": 0.0345, "step": 10926 }, { "epoch": 0.8507225925745706, "grad_norm": 0.3996253933738498, "learning_rate": 2.2459415578980504e-06, "loss": 0.0356, "step": 10927 }, { "epoch": 0.8508004476667802, "grad_norm": 0.445818748415604, "learning_rate": 2.2436438004889037e-06, "loss": 0.0497, "step": 10928 }, { "epoch": 0.8508783027589898, "grad_norm": 0.48824883421289794, "learning_rate": 2.2413471492172854e-06, "loss": 0.0549, "step": 10929 }, { "epoch": 0.8509561578511995, "grad_norm": 0.3937173017022586, "learning_rate": 2.23905160422627e-06, "loss": 0.0342, "step": 10930 }, { "epoch": 0.8510340129434091, "grad_norm": 0.5075314137988425, "learning_rate": 2.2367571656588517e-06, "loss": 0.0489, "step": 10931 }, { "epoch": 0.8511118680356187, "grad_norm": 0.5000346537592041, "learning_rate": 2.234463833657969e-06, "loss": 0.0413, "step": 10932 }, { "epoch": 0.8511897231278284, "grad_norm": 0.548586358998828, "learning_rate": 2.232171608366485e-06, "loss": 0.0499, "step": 10933 }, { "epoch": 0.851267578220038, "grad_norm": 0.3863353211465972, "learning_rate": 2.2298804899271918e-06, "loss": 0.0273, "step": 10934 }, { "epoch": 0.8513454333122475, "grad_norm": 0.40750937638739093, "learning_rate": 2.2275904784828194e-06, "loss": 0.0333, "step": 10935 }, { "epoch": 0.8514232884044572, "grad_norm": 0.40058046280385234, "learning_rate": 2.22530157417602e-06, "loss": 0.0355, "step": 10936 }, { "epoch": 0.8515011434966668, "grad_norm": 0.47492235955934536, "learning_rate": 2.223013777149381e-06, "loss": 0.0416, "step": 10937 }, { "epoch": 0.8515789985888764, "grad_norm": 0.2985594659928975, "learning_rate": 2.220727087545429e-06, "loss": 0.0214, "step": 10938 }, { "epoch": 0.8516568536810861, "grad_norm": 0.6638475505583412, "learning_rate": 2.21844150550661e-06, "loss": 0.065, "step": 10939 }, { "epoch": 0.8517347087732957, "grad_norm": 0.35481158208880265, "learning_rate": 2.2161570311753034e-06, "loss": 0.0237, "step": 10940 }, { "epoch": 0.8518125638655053, "grad_norm": 0.5482295269530616, "learning_rate": 2.213873664693822e-06, "loss": 0.0552, "step": 10941 }, { "epoch": 0.851890418957715, "grad_norm": 0.395681142676802, "learning_rate": 2.21159140620441e-06, "loss": 0.0241, "step": 10942 }, { "epoch": 0.8519682740499246, "grad_norm": 0.46414549749864387, "learning_rate": 2.209310255849242e-06, "loss": 0.0481, "step": 10943 }, { "epoch": 0.8520461291421342, "grad_norm": 0.3908372873087504, "learning_rate": 2.207030213770418e-06, "loss": 0.0373, "step": 10944 }, { "epoch": 0.8521239842343439, "grad_norm": 0.6985339985708362, "learning_rate": 2.2047512801099804e-06, "loss": 0.0682, "step": 10945 }, { "epoch": 0.8522018393265535, "grad_norm": 0.39506969379144086, "learning_rate": 2.2024734550098904e-06, "loss": 0.0308, "step": 10946 }, { "epoch": 0.8522796944187631, "grad_norm": 0.4031818592133215, "learning_rate": 2.2001967386120503e-06, "loss": 0.0322, "step": 10947 }, { "epoch": 0.8523575495109728, "grad_norm": 0.44356559094612197, "learning_rate": 2.1979211310582825e-06, "loss": 0.0476, "step": 10948 }, { "epoch": 0.8524354046031823, "grad_norm": 0.6202368128830461, "learning_rate": 2.1956466324903537e-06, "loss": 0.068, "step": 10949 }, { "epoch": 0.8525132596953919, "grad_norm": 0.4084088306440864, "learning_rate": 2.1933732430499543e-06, "loss": 0.0304, "step": 10950 }, { "epoch": 0.8525132596953919, "eval_loss": 0.005410196725279093, "eval_runtime": 162.5709, "eval_samples_per_second": 17.715, "eval_steps_per_second": 0.634, "step": 10950 }, { "epoch": 0.8525911147876015, "grad_norm": 0.526342838896852, "learning_rate": 2.1911009628787005e-06, "loss": 0.0574, "step": 10951 }, { "epoch": 0.8526689698798112, "grad_norm": 0.3616210903645695, "learning_rate": 2.18882979211815e-06, "loss": 0.0291, "step": 10952 }, { "epoch": 0.8527468249720208, "grad_norm": 0.34629471851749605, "learning_rate": 2.186559730909783e-06, "loss": 0.0267, "step": 10953 }, { "epoch": 0.8528246800642304, "grad_norm": 0.4958862888606893, "learning_rate": 2.184290779395013e-06, "loss": 0.0415, "step": 10954 }, { "epoch": 0.8529025351564401, "grad_norm": 0.3749640971839879, "learning_rate": 2.182022937715187e-06, "loss": 0.0304, "step": 10955 }, { "epoch": 0.8529803902486497, "grad_norm": 0.4409615032001562, "learning_rate": 2.1797562060115807e-06, "loss": 0.0337, "step": 10956 }, { "epoch": 0.8530582453408593, "grad_norm": 0.3889616546514737, "learning_rate": 2.177490584425399e-06, "loss": 0.0351, "step": 10957 }, { "epoch": 0.853136100433069, "grad_norm": 0.503529459386771, "learning_rate": 2.175226073097776e-06, "loss": 0.0582, "step": 10958 }, { "epoch": 0.8532139555252786, "grad_norm": 0.37331233058913965, "learning_rate": 2.1729626721697915e-06, "loss": 0.0321, "step": 10959 }, { "epoch": 0.8532918106174882, "grad_norm": 0.5078038813317471, "learning_rate": 2.1707003817824355e-06, "loss": 0.0559, "step": 10960 }, { "epoch": 0.8533696657096979, "grad_norm": 0.5234921797625358, "learning_rate": 2.168439202076642e-06, "loss": 0.0503, "step": 10961 }, { "epoch": 0.8534475208019074, "grad_norm": 0.4344674463912024, "learning_rate": 2.1661791331932736e-06, "loss": 0.0324, "step": 10962 }, { "epoch": 0.853525375894117, "grad_norm": 0.4302528450725387, "learning_rate": 2.163920175273111e-06, "loss": 0.0363, "step": 10963 }, { "epoch": 0.8536032309863267, "grad_norm": 0.4302961411698442, "learning_rate": 2.1616623284568905e-06, "loss": 0.0367, "step": 10964 }, { "epoch": 0.8536810860785363, "grad_norm": 0.45340385730391713, "learning_rate": 2.1594055928852574e-06, "loss": 0.0403, "step": 10965 }, { "epoch": 0.8537589411707459, "grad_norm": 0.4314235455273141, "learning_rate": 2.157149968698797e-06, "loss": 0.0375, "step": 10966 }, { "epoch": 0.8538367962629556, "grad_norm": 0.35522598184103416, "learning_rate": 2.154895456038026e-06, "loss": 0.0278, "step": 10967 }, { "epoch": 0.8539146513551652, "grad_norm": 0.2581958811678582, "learning_rate": 2.1526420550433856e-06, "loss": 0.0145, "step": 10968 }, { "epoch": 0.8539925064473748, "grad_norm": 0.5001415067094366, "learning_rate": 2.150389765855256e-06, "loss": 0.0461, "step": 10969 }, { "epoch": 0.8540703615395845, "grad_norm": 0.4912664960469359, "learning_rate": 2.148138588613946e-06, "loss": 0.0494, "step": 10970 }, { "epoch": 0.8541482166317941, "grad_norm": 0.5288130588364612, "learning_rate": 2.145888523459696e-06, "loss": 0.0498, "step": 10971 }, { "epoch": 0.8542260717240037, "grad_norm": 0.5663090432395758, "learning_rate": 2.143639570532663e-06, "loss": 0.0618, "step": 10972 }, { "epoch": 0.8543039268162134, "grad_norm": 0.3608367114419689, "learning_rate": 2.141391729972948e-06, "loss": 0.0254, "step": 10973 }, { "epoch": 0.854381781908423, "grad_norm": 0.4514958093894884, "learning_rate": 2.1391450019205906e-06, "loss": 0.031, "step": 10974 }, { "epoch": 0.8544596370006325, "grad_norm": 0.4361973192817373, "learning_rate": 2.1368993865155453e-06, "loss": 0.0321, "step": 10975 }, { "epoch": 0.8545374920928422, "grad_norm": 0.3670537824221249, "learning_rate": 2.1346548838977046e-06, "loss": 0.0306, "step": 10976 }, { "epoch": 0.8546153471850518, "grad_norm": 0.40582665274419727, "learning_rate": 2.1324114942068917e-06, "loss": 0.0298, "step": 10977 }, { "epoch": 0.8546932022772614, "grad_norm": 0.49288235501501204, "learning_rate": 2.130169217582856e-06, "loss": 0.0549, "step": 10978 }, { "epoch": 0.8547710573694711, "grad_norm": 0.47575869030377804, "learning_rate": 2.127928054165278e-06, "loss": 0.0377, "step": 10979 }, { "epoch": 0.8548489124616807, "grad_norm": 0.39907865090811384, "learning_rate": 2.1256880040937843e-06, "loss": 0.0369, "step": 10980 }, { "epoch": 0.8549267675538903, "grad_norm": 0.4474630949157731, "learning_rate": 2.1234490675079077e-06, "loss": 0.0415, "step": 10981 }, { "epoch": 0.8550046226461, "grad_norm": 0.45505661631769145, "learning_rate": 2.1212112445471276e-06, "loss": 0.0417, "step": 10982 }, { "epoch": 0.8550824777383096, "grad_norm": 0.5360076510513284, "learning_rate": 2.1189745353508483e-06, "loss": 0.0641, "step": 10983 }, { "epoch": 0.8551603328305192, "grad_norm": 0.44449279325304736, "learning_rate": 2.1167389400584027e-06, "loss": 0.039, "step": 10984 }, { "epoch": 0.8552381879227289, "grad_norm": 0.446329437099943, "learning_rate": 2.1145044588090678e-06, "loss": 0.0425, "step": 10985 }, { "epoch": 0.8553160430149385, "grad_norm": 0.43164175523343495, "learning_rate": 2.112271091742035e-06, "loss": 0.0361, "step": 10986 }, { "epoch": 0.855393898107148, "grad_norm": 0.5178800059811045, "learning_rate": 2.1100388389964332e-06, "loss": 0.0491, "step": 10987 }, { "epoch": 0.8554717531993576, "grad_norm": 0.40913171872772286, "learning_rate": 2.1078077007113217e-06, "loss": 0.039, "step": 10988 }, { "epoch": 0.8555496082915673, "grad_norm": 0.4833173250087731, "learning_rate": 2.1055776770256897e-06, "loss": 0.0532, "step": 10989 }, { "epoch": 0.8556274633837769, "grad_norm": 0.49668499297977203, "learning_rate": 2.103348768078457e-06, "loss": 0.056, "step": 10990 }, { "epoch": 0.8557053184759865, "grad_norm": 0.4698201985339539, "learning_rate": 2.1011209740084747e-06, "loss": 0.0436, "step": 10991 }, { "epoch": 0.8557831735681962, "grad_norm": 0.4800196904515439, "learning_rate": 2.0988942949545254e-06, "loss": 0.0427, "step": 10992 }, { "epoch": 0.8558610286604058, "grad_norm": 0.5034685139154943, "learning_rate": 2.096668731055316e-06, "loss": 0.0455, "step": 10993 }, { "epoch": 0.8559388837526154, "grad_norm": 0.6057811906281765, "learning_rate": 2.0944442824494883e-06, "loss": 0.0532, "step": 10994 }, { "epoch": 0.8560167388448251, "grad_norm": 0.44080488965128745, "learning_rate": 2.092220949275623e-06, "loss": 0.0339, "step": 10995 }, { "epoch": 0.8560945939370347, "grad_norm": 0.5046560359662454, "learning_rate": 2.089998731672218e-06, "loss": 0.055, "step": 10996 }, { "epoch": 0.8561724490292443, "grad_norm": 0.45145443332591856, "learning_rate": 2.0877776297777073e-06, "loss": 0.0418, "step": 10997 }, { "epoch": 0.856250304121454, "grad_norm": 0.459998723782163, "learning_rate": 2.085557643730458e-06, "loss": 0.0414, "step": 10998 }, { "epoch": 0.8563281592136636, "grad_norm": 0.3901222051159482, "learning_rate": 2.083338773668759e-06, "loss": 0.0365, "step": 10999 }, { "epoch": 0.8564060143058732, "grad_norm": 0.4671640409250499, "learning_rate": 2.0811210197308408e-06, "loss": 0.042, "step": 11000 }, { "epoch": 0.8564060143058732, "eval_loss": 0.005385367199778557, "eval_runtime": 162.2798, "eval_samples_per_second": 17.747, "eval_steps_per_second": 0.635, "step": 11000 }, { "epoch": 0.8564838693980829, "grad_norm": 0.46956814962904375, "learning_rate": 2.078904382054854e-06, "loss": 0.0403, "step": 11001 }, { "epoch": 0.8565617244902924, "grad_norm": 0.45408768551185413, "learning_rate": 2.0766888607788906e-06, "loss": 0.0362, "step": 11002 }, { "epoch": 0.856639579582502, "grad_norm": 0.3897502858681197, "learning_rate": 2.0744744560409624e-06, "loss": 0.0387, "step": 11003 }, { "epoch": 0.8567174346747117, "grad_norm": 0.392390031075325, "learning_rate": 2.072261167979015e-06, "loss": 0.0339, "step": 11004 }, { "epoch": 0.8567952897669213, "grad_norm": 0.4173723826825989, "learning_rate": 2.0700489967309335e-06, "loss": 0.0321, "step": 11005 }, { "epoch": 0.8568731448591309, "grad_norm": 0.3553437094030025, "learning_rate": 2.067837942434521e-06, "loss": 0.0187, "step": 11006 }, { "epoch": 0.8569509999513406, "grad_norm": 0.5064583717584442, "learning_rate": 2.0656280052275135e-06, "loss": 0.0597, "step": 11007 }, { "epoch": 0.8570288550435502, "grad_norm": 0.33459590582066345, "learning_rate": 2.063419185247586e-06, "loss": 0.0227, "step": 11008 }, { "epoch": 0.8571067101357598, "grad_norm": 0.44151903787367797, "learning_rate": 2.0612114826323327e-06, "loss": 0.038, "step": 11009 }, { "epoch": 0.8571845652279695, "grad_norm": 0.43638041628457086, "learning_rate": 2.0590048975192833e-06, "loss": 0.0403, "step": 11010 }, { "epoch": 0.8572624203201791, "grad_norm": 0.47110963444768394, "learning_rate": 2.0567994300458992e-06, "loss": 0.0463, "step": 11011 }, { "epoch": 0.8573402754123887, "grad_norm": 0.3124810470114081, "learning_rate": 2.0545950803495706e-06, "loss": 0.0176, "step": 11012 }, { "epoch": 0.8574181305045984, "grad_norm": 0.3464496267761453, "learning_rate": 2.0523918485676164e-06, "loss": 0.0179, "step": 11013 }, { "epoch": 0.857495985596808, "grad_norm": 0.38703044518691876, "learning_rate": 2.0501897348372867e-06, "loss": 0.0302, "step": 11014 }, { "epoch": 0.8575738406890175, "grad_norm": 0.48907370974363273, "learning_rate": 2.0479887392957674e-06, "loss": 0.0475, "step": 11015 }, { "epoch": 0.8576516957812272, "grad_norm": 0.5543771969956356, "learning_rate": 2.045788862080167e-06, "loss": 0.0502, "step": 11016 }, { "epoch": 0.8577295508734368, "grad_norm": 0.40058322564220256, "learning_rate": 2.0435901033275307e-06, "loss": 0.0315, "step": 11017 }, { "epoch": 0.8578074059656464, "grad_norm": 0.40459318007083545, "learning_rate": 2.041392463174827e-06, "loss": 0.0341, "step": 11018 }, { "epoch": 0.8578852610578561, "grad_norm": 0.4189535441603122, "learning_rate": 2.0391959417589647e-06, "loss": 0.04, "step": 11019 }, { "epoch": 0.8579631161500657, "grad_norm": 0.37961880940509535, "learning_rate": 2.0370005392167626e-06, "loss": 0.0293, "step": 11020 }, { "epoch": 0.8580409712422753, "grad_norm": 0.4559295661727611, "learning_rate": 2.034806255684998e-06, "loss": 0.0402, "step": 11021 }, { "epoch": 0.8581188263344849, "grad_norm": 0.39266013077094625, "learning_rate": 2.0326130913003596e-06, "loss": 0.0257, "step": 11022 }, { "epoch": 0.8581966814266946, "grad_norm": 0.4213258614851509, "learning_rate": 2.030421046199471e-06, "loss": 0.0346, "step": 11023 }, { "epoch": 0.8582745365189042, "grad_norm": 0.39515114688454844, "learning_rate": 2.028230120518888e-06, "loss": 0.0294, "step": 11024 }, { "epoch": 0.8583523916111138, "grad_norm": 0.4762004662707039, "learning_rate": 2.02604031439509e-06, "loss": 0.0401, "step": 11025 }, { "epoch": 0.8584302467033235, "grad_norm": 0.6414663526802125, "learning_rate": 2.0238516279644992e-06, "loss": 0.0732, "step": 11026 }, { "epoch": 0.858508101795533, "grad_norm": 0.3800133518268792, "learning_rate": 2.021664061363455e-06, "loss": 0.0289, "step": 11027 }, { "epoch": 0.8585859568877426, "grad_norm": 0.46784725551891454, "learning_rate": 2.01947761472824e-06, "loss": 0.0379, "step": 11028 }, { "epoch": 0.8586638119799523, "grad_norm": 0.4111763106510537, "learning_rate": 2.0172922881950473e-06, "loss": 0.0365, "step": 11029 }, { "epoch": 0.8587416670721619, "grad_norm": 0.4847754720914854, "learning_rate": 2.015108081900017e-06, "loss": 0.0477, "step": 11030 }, { "epoch": 0.8588195221643715, "grad_norm": 0.38554385775808975, "learning_rate": 2.0129249959792196e-06, "loss": 0.0299, "step": 11031 }, { "epoch": 0.8588973772565812, "grad_norm": 0.3829726045249541, "learning_rate": 2.0107430305686493e-06, "loss": 0.0258, "step": 11032 }, { "epoch": 0.8589752323487908, "grad_norm": 0.4346676638956387, "learning_rate": 2.0085621858042303e-06, "loss": 0.037, "step": 11033 }, { "epoch": 0.8590530874410004, "grad_norm": 0.49783328623075973, "learning_rate": 2.0063824618218208e-06, "loss": 0.0352, "step": 11034 }, { "epoch": 0.8591309425332101, "grad_norm": 0.4707320521805808, "learning_rate": 2.004203858757201e-06, "loss": 0.0451, "step": 11035 }, { "epoch": 0.8592087976254197, "grad_norm": 0.5086992294791756, "learning_rate": 2.002026376746098e-06, "loss": 0.0452, "step": 11036 }, { "epoch": 0.8592866527176293, "grad_norm": 0.5971925999316222, "learning_rate": 1.9998500159241584e-06, "loss": 0.0657, "step": 11037 }, { "epoch": 0.859364507809839, "grad_norm": 0.4843721190725027, "learning_rate": 1.99767477642695e-06, "loss": 0.0272, "step": 11038 }, { "epoch": 0.8594423629020486, "grad_norm": 0.35746332394640307, "learning_rate": 1.995500658389984e-06, "loss": 0.022, "step": 11039 }, { "epoch": 0.8595202179942582, "grad_norm": 0.35550058476370133, "learning_rate": 1.9933276619486963e-06, "loss": 0.0288, "step": 11040 }, { "epoch": 0.8595980730864679, "grad_norm": 0.4790307266699052, "learning_rate": 1.991155787238459e-06, "loss": 0.0469, "step": 11041 }, { "epoch": 0.8596759281786774, "grad_norm": 0.4887042714777929, "learning_rate": 1.9889850343945662e-06, "loss": 0.0598, "step": 11042 }, { "epoch": 0.859753783270887, "grad_norm": 0.3827488069791821, "learning_rate": 1.9868154035522492e-06, "loss": 0.0342, "step": 11043 }, { "epoch": 0.8598316383630967, "grad_norm": 0.4961806616421024, "learning_rate": 1.9846468948466603e-06, "loss": 0.0549, "step": 11044 }, { "epoch": 0.8599094934553063, "grad_norm": 0.45100979258626867, "learning_rate": 1.982479508412887e-06, "loss": 0.0391, "step": 11045 }, { "epoch": 0.8599873485475159, "grad_norm": 0.5700094323533411, "learning_rate": 1.980313244385956e-06, "loss": 0.0507, "step": 11046 }, { "epoch": 0.8600652036397256, "grad_norm": 0.44519743021921276, "learning_rate": 1.978148102900812e-06, "loss": 0.0466, "step": 11047 }, { "epoch": 0.8601430587319352, "grad_norm": 0.40650557799464687, "learning_rate": 1.9759840840923284e-06, "loss": 0.0351, "step": 11048 }, { "epoch": 0.8602209138241448, "grad_norm": 0.48640504261765116, "learning_rate": 1.973821188095315e-06, "loss": 0.0451, "step": 11049 }, { "epoch": 0.8602987689163545, "grad_norm": 0.41640891364941074, "learning_rate": 1.971659415044507e-06, "loss": 0.0302, "step": 11050 }, { "epoch": 0.8602987689163545, "eval_loss": 0.005364974029362202, "eval_runtime": 162.825, "eval_samples_per_second": 17.688, "eval_steps_per_second": 0.633, "step": 11050 }, { "epoch": 0.8603766240085641, "grad_norm": 0.40811373931569217, "learning_rate": 1.969498765074582e-06, "loss": 0.034, "step": 11051 }, { "epoch": 0.8604544791007737, "grad_norm": 0.3869021810581631, "learning_rate": 1.967339238320132e-06, "loss": 0.0338, "step": 11052 }, { "epoch": 0.8605323341929834, "grad_norm": 0.3606720715196587, "learning_rate": 1.9651808349156854e-06, "loss": 0.0238, "step": 11053 }, { "epoch": 0.860610189285193, "grad_norm": 0.4679429783474499, "learning_rate": 1.9630235549957044e-06, "loss": 0.0389, "step": 11054 }, { "epoch": 0.8606880443774025, "grad_norm": 0.441987014409732, "learning_rate": 1.9608673986945724e-06, "loss": 0.0312, "step": 11055 }, { "epoch": 0.8607658994696122, "grad_norm": 0.40478260667225013, "learning_rate": 1.9587123661466114e-06, "loss": 0.0379, "step": 11056 }, { "epoch": 0.8608437545618218, "grad_norm": 0.41067662437903796, "learning_rate": 1.9565584574860665e-06, "loss": 0.0303, "step": 11057 }, { "epoch": 0.8609216096540314, "grad_norm": 0.29645155881226465, "learning_rate": 1.9544056728471195e-06, "loss": 0.0201, "step": 11058 }, { "epoch": 0.860999464746241, "grad_norm": 0.5127059163425585, "learning_rate": 1.952254012363879e-06, "loss": 0.0447, "step": 11059 }, { "epoch": 0.8610773198384507, "grad_norm": 0.4030435882840127, "learning_rate": 1.9501034761703795e-06, "loss": 0.0379, "step": 11060 }, { "epoch": 0.8611551749306603, "grad_norm": 0.37785370528299306, "learning_rate": 1.9479540644005878e-06, "loss": 0.0308, "step": 11061 }, { "epoch": 0.8612330300228699, "grad_norm": 0.5274884622748721, "learning_rate": 1.94580577718841e-06, "loss": 0.0493, "step": 11062 }, { "epoch": 0.8613108851150796, "grad_norm": 0.45305878369684716, "learning_rate": 1.9436586146676696e-06, "loss": 0.0359, "step": 11063 }, { "epoch": 0.8613887402072892, "grad_norm": 0.4602817044994781, "learning_rate": 1.9415125769721243e-06, "loss": 0.0486, "step": 11064 }, { "epoch": 0.8614665952994988, "grad_norm": 0.47416566818316797, "learning_rate": 1.9393676642354654e-06, "loss": 0.0386, "step": 11065 }, { "epoch": 0.8615444503917085, "grad_norm": 0.3664024631592822, "learning_rate": 1.9372238765913074e-06, "loss": 0.0308, "step": 11066 }, { "epoch": 0.861622305483918, "grad_norm": 0.5493215770947869, "learning_rate": 1.9350812141731976e-06, "loss": 0.0626, "step": 11067 }, { "epoch": 0.8617001605761276, "grad_norm": 0.3759419252678785, "learning_rate": 1.932939677114618e-06, "loss": 0.0315, "step": 11068 }, { "epoch": 0.8617780156683373, "grad_norm": 0.43323216038828455, "learning_rate": 1.9307992655489726e-06, "loss": 0.032, "step": 11069 }, { "epoch": 0.8618558707605469, "grad_norm": 0.4992845993072184, "learning_rate": 1.9286599796096016e-06, "loss": 0.05, "step": 11070 }, { "epoch": 0.8619337258527565, "grad_norm": 0.3946434429718501, "learning_rate": 1.926521819429765e-06, "loss": 0.0301, "step": 11071 }, { "epoch": 0.8620115809449662, "grad_norm": 0.45365290080952125, "learning_rate": 1.924384785142672e-06, "loss": 0.0404, "step": 11072 }, { "epoch": 0.8620894360371758, "grad_norm": 0.4685293473848031, "learning_rate": 1.922248876881443e-06, "loss": 0.0511, "step": 11073 }, { "epoch": 0.8621672911293854, "grad_norm": 0.35390227322802253, "learning_rate": 1.9201140947791375e-06, "loss": 0.026, "step": 11074 }, { "epoch": 0.8622451462215951, "grad_norm": 0.42154683334741067, "learning_rate": 1.9179804389687385e-06, "loss": 0.041, "step": 11075 }, { "epoch": 0.8623230013138047, "grad_norm": 0.40880471091829773, "learning_rate": 1.915847909583168e-06, "loss": 0.0407, "step": 11076 }, { "epoch": 0.8624008564060143, "grad_norm": 0.5717436259296724, "learning_rate": 1.9137165067552675e-06, "loss": 0.0624, "step": 11077 }, { "epoch": 0.862478711498224, "grad_norm": 0.384438001838887, "learning_rate": 1.911586230617817e-06, "loss": 0.0198, "step": 11078 }, { "epoch": 0.8625565665904336, "grad_norm": 0.4124124135783096, "learning_rate": 1.909457081303521e-06, "loss": 0.0339, "step": 11079 }, { "epoch": 0.8626344216826431, "grad_norm": 0.4917054522423206, "learning_rate": 1.9073290589450134e-06, "loss": 0.054, "step": 11080 }, { "epoch": 0.8627122767748528, "grad_norm": 0.40343283460641777, "learning_rate": 1.905202163674862e-06, "loss": 0.0365, "step": 11081 }, { "epoch": 0.8627901318670624, "grad_norm": 0.35600562623228016, "learning_rate": 1.9030763956255648e-06, "loss": 0.0252, "step": 11082 }, { "epoch": 0.862867986959272, "grad_norm": 0.4454216169135332, "learning_rate": 1.9009517549295454e-06, "loss": 0.0243, "step": 11083 }, { "epoch": 0.8629458420514817, "grad_norm": 0.29661825755567206, "learning_rate": 1.8988282417191572e-06, "loss": 0.0169, "step": 11084 }, { "epoch": 0.8630236971436913, "grad_norm": 0.5237351703327999, "learning_rate": 1.8967058561266882e-06, "loss": 0.0509, "step": 11085 }, { "epoch": 0.8631015522359009, "grad_norm": 0.5788605867115428, "learning_rate": 1.8945845982843503e-06, "loss": 0.0553, "step": 11086 }, { "epoch": 0.8631794073281106, "grad_norm": 0.4416610627333424, "learning_rate": 1.892464468324291e-06, "loss": 0.0412, "step": 11087 }, { "epoch": 0.8632572624203202, "grad_norm": 0.49357116775261667, "learning_rate": 1.8903454663785802e-06, "loss": 0.0498, "step": 11088 }, { "epoch": 0.8633351175125298, "grad_norm": 0.4705064529190664, "learning_rate": 1.888227592579226e-06, "loss": 0.0421, "step": 11089 }, { "epoch": 0.8634129726047395, "grad_norm": 0.4923428574376345, "learning_rate": 1.886110847058158e-06, "loss": 0.0429, "step": 11090 }, { "epoch": 0.8634908276969491, "grad_norm": 0.3883349272252264, "learning_rate": 1.8839952299472374e-06, "loss": 0.0422, "step": 11091 }, { "epoch": 0.8635686827891587, "grad_norm": 0.4545443362024494, "learning_rate": 1.881880741378266e-06, "loss": 0.0398, "step": 11092 }, { "epoch": 0.8636465378813682, "grad_norm": 0.2984770497881905, "learning_rate": 1.879767381482962e-06, "loss": 0.013, "step": 11093 }, { "epoch": 0.863724392973578, "grad_norm": 0.5096501812489572, "learning_rate": 1.8776551503929786e-06, "loss": 0.0511, "step": 11094 }, { "epoch": 0.8638022480657875, "grad_norm": 0.4167857239225412, "learning_rate": 1.875544048239899e-06, "loss": 0.0394, "step": 11095 }, { "epoch": 0.8638801031579971, "grad_norm": 0.43156752241099694, "learning_rate": 1.8734340751552293e-06, "loss": 0.033, "step": 11096 }, { "epoch": 0.8639579582502068, "grad_norm": 0.44967159699409975, "learning_rate": 1.8713252312704088e-06, "loss": 0.0412, "step": 11097 }, { "epoch": 0.8640358133424164, "grad_norm": 0.535117590789304, "learning_rate": 1.8692175167168193e-06, "loss": 0.0554, "step": 11098 }, { "epoch": 0.864113668434626, "grad_norm": 0.5521396221732053, "learning_rate": 1.8671109316257575e-06, "loss": 0.0595, "step": 11099 }, { "epoch": 0.8641915235268357, "grad_norm": 0.44004075833667955, "learning_rate": 1.865005476128452e-06, "loss": 0.0429, "step": 11100 }, { "epoch": 0.8641915235268357, "eval_loss": 0.00531988637521863, "eval_runtime": 162.242, "eval_samples_per_second": 17.751, "eval_steps_per_second": 0.635, "step": 11100 }, { "epoch": 0.8642693786190453, "grad_norm": 0.39613268734624035, "learning_rate": 1.8629011503560623e-06, "loss": 0.0254, "step": 11101 }, { "epoch": 0.8643472337112549, "grad_norm": 0.44583899026197543, "learning_rate": 1.8607979544396747e-06, "loss": 0.0323, "step": 11102 }, { "epoch": 0.8644250888034646, "grad_norm": 0.4404474386258369, "learning_rate": 1.8586958885103179e-06, "loss": 0.0396, "step": 11103 }, { "epoch": 0.8645029438956742, "grad_norm": 0.37227928000957194, "learning_rate": 1.8565949526989403e-06, "loss": 0.0249, "step": 11104 }, { "epoch": 0.8645807989878838, "grad_norm": 0.4488999004367777, "learning_rate": 1.8544951471364103e-06, "loss": 0.0443, "step": 11105 }, { "epoch": 0.8646586540800935, "grad_norm": 0.3883704394414828, "learning_rate": 1.8523964719535415e-06, "loss": 0.0224, "step": 11106 }, { "epoch": 0.864736509172303, "grad_norm": 0.31979470532630017, "learning_rate": 1.8502989272810668e-06, "loss": 0.0199, "step": 11107 }, { "epoch": 0.8648143642645126, "grad_norm": 0.40168625918507106, "learning_rate": 1.8482025132496639e-06, "loss": 0.0293, "step": 11108 }, { "epoch": 0.8648922193567223, "grad_norm": 0.4336062689969766, "learning_rate": 1.846107229989922e-06, "loss": 0.0431, "step": 11109 }, { "epoch": 0.8649700744489319, "grad_norm": 0.4826832039429231, "learning_rate": 1.8440130776323672e-06, "loss": 0.0439, "step": 11110 }, { "epoch": 0.8650479295411415, "grad_norm": 0.43232185086891467, "learning_rate": 1.8419200563074603e-06, "loss": 0.0323, "step": 11111 }, { "epoch": 0.8651257846333512, "grad_norm": 0.3635015338564324, "learning_rate": 1.8398281661455764e-06, "loss": 0.0215, "step": 11112 }, { "epoch": 0.8652036397255608, "grad_norm": 0.4305223171914786, "learning_rate": 1.8377374072770472e-06, "loss": 0.0399, "step": 11113 }, { "epoch": 0.8652814948177704, "grad_norm": 0.47001992220398914, "learning_rate": 1.8356477798321015e-06, "loss": 0.0563, "step": 11114 }, { "epoch": 0.8653593499099801, "grad_norm": 0.4024901238881051, "learning_rate": 1.8335592839409199e-06, "loss": 0.0316, "step": 11115 }, { "epoch": 0.8654372050021897, "grad_norm": 0.39933708119155403, "learning_rate": 1.8314719197336072e-06, "loss": 0.0303, "step": 11116 }, { "epoch": 0.8655150600943993, "grad_norm": 0.5310420665870065, "learning_rate": 1.8293856873401882e-06, "loss": 0.048, "step": 11117 }, { "epoch": 0.865592915186609, "grad_norm": 0.3863394423731884, "learning_rate": 1.8273005868906368e-06, "loss": 0.0289, "step": 11118 }, { "epoch": 0.8656707702788186, "grad_norm": 0.38132447452257956, "learning_rate": 1.825216618514838e-06, "loss": 0.0247, "step": 11119 }, { "epoch": 0.8657486253710281, "grad_norm": 0.5987650854361729, "learning_rate": 1.8231337823426143e-06, "loss": 0.0413, "step": 11120 }, { "epoch": 0.8658264804632378, "grad_norm": 0.4974601297022848, "learning_rate": 1.82105207850372e-06, "loss": 0.0464, "step": 11121 }, { "epoch": 0.8659043355554474, "grad_norm": 0.7675067071309413, "learning_rate": 1.8189715071278314e-06, "loss": 0.0854, "step": 11122 }, { "epoch": 0.865982190647657, "grad_norm": 0.4032784850213941, "learning_rate": 1.816892068344558e-06, "loss": 0.0321, "step": 11123 }, { "epoch": 0.8660600457398667, "grad_norm": 0.6205042952963005, "learning_rate": 1.8148137622834404e-06, "loss": 0.06, "step": 11124 }, { "epoch": 0.8661379008320763, "grad_norm": 0.43252430318511087, "learning_rate": 1.812736589073949e-06, "loss": 0.0341, "step": 11125 }, { "epoch": 0.8662157559242859, "grad_norm": 0.34133234734709184, "learning_rate": 1.810660548845482e-06, "loss": 0.0242, "step": 11126 }, { "epoch": 0.8662936110164956, "grad_norm": 0.38045928505164806, "learning_rate": 1.8085856417273585e-06, "loss": 0.0289, "step": 11127 }, { "epoch": 0.8663714661087052, "grad_norm": 0.6040946050043502, "learning_rate": 1.806511867848848e-06, "loss": 0.0707, "step": 11128 }, { "epoch": 0.8664493212009148, "grad_norm": 0.40521216437319935, "learning_rate": 1.80443922733913e-06, "loss": 0.0331, "step": 11129 }, { "epoch": 0.8665271762931244, "grad_norm": 0.5128906995032453, "learning_rate": 1.8023677203273226e-06, "loss": 0.0565, "step": 11130 }, { "epoch": 0.8666050313853341, "grad_norm": 0.5894565660679105, "learning_rate": 1.8002973469424678e-06, "loss": 0.0619, "step": 11131 }, { "epoch": 0.8666828864775437, "grad_norm": 0.44359743562764403, "learning_rate": 1.7982281073135443e-06, "loss": 0.0378, "step": 11132 }, { "epoch": 0.8667607415697532, "grad_norm": 0.3596701749441035, "learning_rate": 1.7961600015694536e-06, "loss": 0.0233, "step": 11133 }, { "epoch": 0.866838596661963, "grad_norm": 0.4269417193299835, "learning_rate": 1.794093029839028e-06, "loss": 0.0407, "step": 11134 }, { "epoch": 0.8669164517541725, "grad_norm": 0.4713690165218403, "learning_rate": 1.7920271922510313e-06, "loss": 0.0414, "step": 11135 }, { "epoch": 0.8669943068463821, "grad_norm": 0.38765443012214784, "learning_rate": 1.789962488934156e-06, "loss": 0.0312, "step": 11136 }, { "epoch": 0.8670721619385918, "grad_norm": 0.4394681616719035, "learning_rate": 1.7878989200170238e-06, "loss": 0.0406, "step": 11137 }, { "epoch": 0.8671500170308014, "grad_norm": 0.5033408624991829, "learning_rate": 1.7858364856281806e-06, "loss": 0.0402, "step": 11138 }, { "epoch": 0.867227872123011, "grad_norm": 0.42948753004394297, "learning_rate": 1.7837751858961128e-06, "loss": 0.0327, "step": 11139 }, { "epoch": 0.8673057272152207, "grad_norm": 0.4243999922116611, "learning_rate": 1.7817150209492284e-06, "loss": 0.0392, "step": 11140 }, { "epoch": 0.8673835823074303, "grad_norm": 0.3854105304916841, "learning_rate": 1.7796559909158627e-06, "loss": 0.0254, "step": 11141 }, { "epoch": 0.8674614373996399, "grad_norm": 0.435168187820823, "learning_rate": 1.7775980959242867e-06, "loss": 0.0341, "step": 11142 }, { "epoch": 0.8675392924918496, "grad_norm": 0.32065169399348875, "learning_rate": 1.7755413361026975e-06, "loss": 0.0236, "step": 11143 }, { "epoch": 0.8676171475840592, "grad_norm": 0.39934819644323316, "learning_rate": 1.7734857115792193e-06, "loss": 0.0386, "step": 11144 }, { "epoch": 0.8676950026762688, "grad_norm": 0.6675459983123514, "learning_rate": 1.7714312224819097e-06, "loss": 0.074, "step": 11145 }, { "epoch": 0.8677728577684785, "grad_norm": 0.45623439612058325, "learning_rate": 1.769377868938753e-06, "loss": 0.0458, "step": 11146 }, { "epoch": 0.867850712860688, "grad_norm": 0.5121921781943539, "learning_rate": 1.7673256510776625e-06, "loss": 0.0522, "step": 11147 }, { "epoch": 0.8679285679528976, "grad_norm": 0.4787115952120446, "learning_rate": 1.7652745690264806e-06, "loss": 0.0454, "step": 11148 }, { "epoch": 0.8680064230451073, "grad_norm": 0.3084422479514402, "learning_rate": 1.7632246229129845e-06, "loss": 0.0178, "step": 11149 }, { "epoch": 0.8680842781373169, "grad_norm": 0.4843670941215964, "learning_rate": 1.7611758128648747e-06, "loss": 0.0506, "step": 11150 }, { "epoch": 0.8680842781373169, "eval_loss": 0.005310529377311468, "eval_runtime": 162.8035, "eval_samples_per_second": 17.69, "eval_steps_per_second": 0.633, "step": 11150 }, { "epoch": 0.8681621332295265, "grad_norm": 0.4138878688701613, "learning_rate": 1.7591281390097825e-06, "loss": 0.0303, "step": 11151 }, { "epoch": 0.8682399883217362, "grad_norm": 0.501363093655133, "learning_rate": 1.7570816014752657e-06, "loss": 0.0488, "step": 11152 }, { "epoch": 0.8683178434139458, "grad_norm": 0.6333355756039025, "learning_rate": 1.7550362003888177e-06, "loss": 0.0681, "step": 11153 }, { "epoch": 0.8683956985061554, "grad_norm": 0.48241504932759754, "learning_rate": 1.7529919358778546e-06, "loss": 0.0499, "step": 11154 }, { "epoch": 0.8684735535983651, "grad_norm": 0.457960072047396, "learning_rate": 1.7509488080697257e-06, "loss": 0.0474, "step": 11155 }, { "epoch": 0.8685514086905747, "grad_norm": 0.41883510291358017, "learning_rate": 1.748906817091709e-06, "loss": 0.0386, "step": 11156 }, { "epoch": 0.8686292637827843, "grad_norm": 0.38484075654606537, "learning_rate": 1.7468659630710072e-06, "loss": 0.0344, "step": 11157 }, { "epoch": 0.868707118874994, "grad_norm": 0.5783216521121656, "learning_rate": 1.7448262461347565e-06, "loss": 0.0662, "step": 11158 }, { "epoch": 0.8687849739672036, "grad_norm": 0.3963140845194436, "learning_rate": 1.7427876664100285e-06, "loss": 0.0309, "step": 11159 }, { "epoch": 0.8688628290594131, "grad_norm": 0.3443448592825108, "learning_rate": 1.7407502240238105e-06, "loss": 0.0217, "step": 11160 }, { "epoch": 0.8689406841516228, "grad_norm": 0.5403188334194275, "learning_rate": 1.7387139191030278e-06, "loss": 0.0687, "step": 11161 }, { "epoch": 0.8690185392438324, "grad_norm": 0.3547630472458451, "learning_rate": 1.7366787517745364e-06, "loss": 0.025, "step": 11162 }, { "epoch": 0.869096394336042, "grad_norm": 0.5006867263133474, "learning_rate": 1.7346447221651063e-06, "loss": 0.0461, "step": 11163 }, { "epoch": 0.8691742494282517, "grad_norm": 0.45114834812117893, "learning_rate": 1.7326118304014604e-06, "loss": 0.042, "step": 11164 }, { "epoch": 0.8692521045204613, "grad_norm": 0.3070200688725562, "learning_rate": 1.730580076610231e-06, "loss": 0.0186, "step": 11165 }, { "epoch": 0.8693299596126709, "grad_norm": 0.3976955507402723, "learning_rate": 1.7285494609179899e-06, "loss": 0.0276, "step": 11166 }, { "epoch": 0.8694078147048805, "grad_norm": 0.39361911785707077, "learning_rate": 1.7265199834512336e-06, "loss": 0.0318, "step": 11167 }, { "epoch": 0.8694856697970902, "grad_norm": 0.49616868149837595, "learning_rate": 1.7244916443363857e-06, "loss": 0.0479, "step": 11168 }, { "epoch": 0.8695635248892998, "grad_norm": 0.4930449335549058, "learning_rate": 1.7224644436998094e-06, "loss": 0.0446, "step": 11169 }, { "epoch": 0.8696413799815094, "grad_norm": 0.43275326209421194, "learning_rate": 1.720438381667786e-06, "loss": 0.0316, "step": 11170 }, { "epoch": 0.8697192350737191, "grad_norm": 0.4348860458027033, "learning_rate": 1.7184134583665347e-06, "loss": 0.0375, "step": 11171 }, { "epoch": 0.8697970901659287, "grad_norm": 0.39064787194889405, "learning_rate": 1.7163896739221897e-06, "loss": 0.0305, "step": 11172 }, { "epoch": 0.8698749452581382, "grad_norm": 0.36581094254034674, "learning_rate": 1.7143670284608282e-06, "loss": 0.0326, "step": 11173 }, { "epoch": 0.8699528003503479, "grad_norm": 0.5715048832949599, "learning_rate": 1.712345522108445e-06, "loss": 0.0712, "step": 11174 }, { "epoch": 0.8700306554425575, "grad_norm": 0.4738780035574925, "learning_rate": 1.7103251549909817e-06, "loss": 0.0496, "step": 11175 }, { "epoch": 0.8701085105347671, "grad_norm": 0.5323571185832124, "learning_rate": 1.7083059272342927e-06, "loss": 0.0539, "step": 11176 }, { "epoch": 0.8701863656269768, "grad_norm": 0.33277285441102283, "learning_rate": 1.7062878389641647e-06, "loss": 0.0218, "step": 11177 }, { "epoch": 0.8702642207191864, "grad_norm": 0.3521061989222251, "learning_rate": 1.7042708903063165e-06, "loss": 0.0312, "step": 11178 }, { "epoch": 0.870342075811396, "grad_norm": 0.4037870018556065, "learning_rate": 1.7022550813863904e-06, "loss": 0.024, "step": 11179 }, { "epoch": 0.8704199309036057, "grad_norm": 0.36288247804094903, "learning_rate": 1.7002404123299722e-06, "loss": 0.032, "step": 11180 }, { "epoch": 0.8704977859958153, "grad_norm": 0.39502416068444535, "learning_rate": 1.698226883262557e-06, "loss": 0.0344, "step": 11181 }, { "epoch": 0.8705756410880249, "grad_norm": 0.42462204769670653, "learning_rate": 1.6962144943095827e-06, "loss": 0.0298, "step": 11182 }, { "epoch": 0.8706534961802346, "grad_norm": 0.4684075048720177, "learning_rate": 1.6942032455964086e-06, "loss": 0.0415, "step": 11183 }, { "epoch": 0.8707313512724442, "grad_norm": 0.4514969743823264, "learning_rate": 1.6921931372483237e-06, "loss": 0.0438, "step": 11184 }, { "epoch": 0.8708092063646538, "grad_norm": 0.5008841221718688, "learning_rate": 1.6901841693905542e-06, "loss": 0.0507, "step": 11185 }, { "epoch": 0.8708870614568635, "grad_norm": 0.43820060968978564, "learning_rate": 1.688176342148249e-06, "loss": 0.0349, "step": 11186 }, { "epoch": 0.870964916549073, "grad_norm": 0.4269061847430256, "learning_rate": 1.6861696556464858e-06, "loss": 0.0425, "step": 11187 }, { "epoch": 0.8710427716412826, "grad_norm": 0.4722064905620038, "learning_rate": 1.684164110010269e-06, "loss": 0.0492, "step": 11188 }, { "epoch": 0.8711206267334923, "grad_norm": 0.49460019105585273, "learning_rate": 1.6821597053645344e-06, "loss": 0.0428, "step": 11189 }, { "epoch": 0.8711984818257019, "grad_norm": 0.5099715065257276, "learning_rate": 1.680156441834151e-06, "loss": 0.0461, "step": 11190 }, { "epoch": 0.8712763369179115, "grad_norm": 0.5468773420509208, "learning_rate": 1.6781543195439098e-06, "loss": 0.0559, "step": 11191 }, { "epoch": 0.8713541920101212, "grad_norm": 0.411759283654743, "learning_rate": 1.6761533386185336e-06, "loss": 0.036, "step": 11192 }, { "epoch": 0.8714320471023308, "grad_norm": 0.32318991706465616, "learning_rate": 1.6741534991826758e-06, "loss": 0.0193, "step": 11193 }, { "epoch": 0.8715099021945404, "grad_norm": 0.5017570743679216, "learning_rate": 1.6721548013609124e-06, "loss": 0.0507, "step": 11194 }, { "epoch": 0.8715877572867501, "grad_norm": 0.42814042463899793, "learning_rate": 1.6701572452777592e-06, "loss": 0.0342, "step": 11195 }, { "epoch": 0.8716656123789597, "grad_norm": 0.40481127010177337, "learning_rate": 1.6681608310576525e-06, "loss": 0.024, "step": 11196 }, { "epoch": 0.8717434674711693, "grad_norm": 0.39870339316916353, "learning_rate": 1.666165558824957e-06, "loss": 0.0283, "step": 11197 }, { "epoch": 0.871821322563379, "grad_norm": 0.4819295961956377, "learning_rate": 1.6641714287039712e-06, "loss": 0.0422, "step": 11198 }, { "epoch": 0.8718991776555886, "grad_norm": 0.6059640202613551, "learning_rate": 1.66217844081892e-06, "loss": 0.0576, "step": 11199 }, { "epoch": 0.8719770327477981, "grad_norm": 0.3868274774620561, "learning_rate": 1.6601865952939577e-06, "loss": 0.0301, "step": 11200 }, { "epoch": 0.8719770327477981, "eval_loss": 0.005237857345491648, "eval_runtime": 162.0731, "eval_samples_per_second": 17.77, "eval_steps_per_second": 0.636, "step": 11200 }, { "epoch": 0.8720548878400077, "grad_norm": 0.3773638833167897, "learning_rate": 1.658195892253165e-06, "loss": 0.0308, "step": 11201 }, { "epoch": 0.8721327429322174, "grad_norm": 0.4462754271390219, "learning_rate": 1.6562063318205535e-06, "loss": 0.0413, "step": 11202 }, { "epoch": 0.872210598024427, "grad_norm": 0.4758955065049234, "learning_rate": 1.6542179141200643e-06, "loss": 0.0438, "step": 11203 }, { "epoch": 0.8722884531166366, "grad_norm": 0.41359307017781854, "learning_rate": 1.6522306392755628e-06, "loss": 0.0328, "step": 11204 }, { "epoch": 0.8723663082088463, "grad_norm": 0.4336240265115223, "learning_rate": 1.6502445074108543e-06, "loss": 0.0368, "step": 11205 }, { "epoch": 0.8724441633010559, "grad_norm": 0.5499658923484708, "learning_rate": 1.6482595186496597e-06, "loss": 0.0617, "step": 11206 }, { "epoch": 0.8725220183932655, "grad_norm": 0.32515528136798066, "learning_rate": 1.6462756731156382e-06, "loss": 0.0257, "step": 11207 }, { "epoch": 0.8725998734854752, "grad_norm": 0.4582766049220839, "learning_rate": 1.6442929709323707e-06, "loss": 0.0514, "step": 11208 }, { "epoch": 0.8726777285776848, "grad_norm": 0.4888328052751259, "learning_rate": 1.642311412223372e-06, "loss": 0.052, "step": 11209 }, { "epoch": 0.8727555836698944, "grad_norm": 0.3704955554559461, "learning_rate": 1.640330997112085e-06, "loss": 0.03, "step": 11210 }, { "epoch": 0.8728334387621041, "grad_norm": 0.3529567717672805, "learning_rate": 1.6383517257218785e-06, "loss": 0.0277, "step": 11211 }, { "epoch": 0.8729112938543137, "grad_norm": 0.36617249934373364, "learning_rate": 1.636373598176051e-06, "loss": 0.0299, "step": 11212 }, { "epoch": 0.8729891489465232, "grad_norm": 0.4068423928644254, "learning_rate": 1.6343966145978329e-06, "loss": 0.0329, "step": 11213 }, { "epoch": 0.8730670040387329, "grad_norm": 0.4230301326319881, "learning_rate": 1.6324207751103772e-06, "loss": 0.0425, "step": 11214 }, { "epoch": 0.8731448591309425, "grad_norm": 0.5068779846663487, "learning_rate": 1.6304460798367716e-06, "loss": 0.055, "step": 11215 }, { "epoch": 0.8732227142231521, "grad_norm": 0.44229171607684986, "learning_rate": 1.6284725289000314e-06, "loss": 0.0412, "step": 11216 }, { "epoch": 0.8733005693153618, "grad_norm": 0.46546121975772314, "learning_rate": 1.6265001224231002e-06, "loss": 0.0414, "step": 11217 }, { "epoch": 0.8733784244075714, "grad_norm": 0.37767823671898537, "learning_rate": 1.624528860528849e-06, "loss": 0.0273, "step": 11218 }, { "epoch": 0.873456279499781, "grad_norm": 0.549762416758808, "learning_rate": 1.6225587433400768e-06, "loss": 0.0544, "step": 11219 }, { "epoch": 0.8735341345919907, "grad_norm": 0.3559606738644557, "learning_rate": 1.6205897709795148e-06, "loss": 0.0308, "step": 11220 }, { "epoch": 0.8736119896842003, "grad_norm": 0.3615999118053804, "learning_rate": 1.618621943569816e-06, "loss": 0.0267, "step": 11221 }, { "epoch": 0.8736898447764099, "grad_norm": 0.4099483975907969, "learning_rate": 1.6166552612335729e-06, "loss": 0.0358, "step": 11222 }, { "epoch": 0.8737676998686196, "grad_norm": 0.6067479852902192, "learning_rate": 1.6146897240932968e-06, "loss": 0.0744, "step": 11223 }, { "epoch": 0.8738455549608292, "grad_norm": 0.3819640925547387, "learning_rate": 1.612725332271432e-06, "loss": 0.0263, "step": 11224 }, { "epoch": 0.8739234100530388, "grad_norm": 0.34997550721956955, "learning_rate": 1.6107620858903471e-06, "loss": 0.0273, "step": 11225 }, { "epoch": 0.8740012651452485, "grad_norm": 0.4723361347216817, "learning_rate": 1.6087999850723512e-06, "loss": 0.0382, "step": 11226 }, { "epoch": 0.874079120237458, "grad_norm": 0.6298258040519308, "learning_rate": 1.6068390299396709e-06, "loss": 0.0653, "step": 11227 }, { "epoch": 0.8741569753296676, "grad_norm": 0.403333474717166, "learning_rate": 1.6048792206144615e-06, "loss": 0.0281, "step": 11228 }, { "epoch": 0.8742348304218773, "grad_norm": 0.400367101658812, "learning_rate": 1.6029205572188166e-06, "loss": 0.0274, "step": 11229 }, { "epoch": 0.8743126855140869, "grad_norm": 0.38816501161451605, "learning_rate": 1.6009630398747412e-06, "loss": 0.0354, "step": 11230 }, { "epoch": 0.8743905406062965, "grad_norm": 0.3499453700029088, "learning_rate": 1.5990066687041884e-06, "loss": 0.0255, "step": 11231 }, { "epoch": 0.8744683956985062, "grad_norm": 0.46562850678966894, "learning_rate": 1.5970514438290253e-06, "loss": 0.0486, "step": 11232 }, { "epoch": 0.8745462507907158, "grad_norm": 0.46344696907597654, "learning_rate": 1.595097365371059e-06, "loss": 0.0452, "step": 11233 }, { "epoch": 0.8746241058829254, "grad_norm": 0.42900382156676425, "learning_rate": 1.593144433452014e-06, "loss": 0.0385, "step": 11234 }, { "epoch": 0.8747019609751351, "grad_norm": 0.336145898757479, "learning_rate": 1.591192648193549e-06, "loss": 0.0232, "step": 11235 }, { "epoch": 0.8747798160673447, "grad_norm": 0.4207510291039711, "learning_rate": 1.5892420097172556e-06, "loss": 0.0379, "step": 11236 }, { "epoch": 0.8748576711595543, "grad_norm": 0.3498778308289276, "learning_rate": 1.5872925181446453e-06, "loss": 0.0302, "step": 11237 }, { "epoch": 0.8749355262517639, "grad_norm": 0.490400075264595, "learning_rate": 1.5853441735971698e-06, "loss": 0.0482, "step": 11238 }, { "epoch": 0.8750133813439736, "grad_norm": 0.32426535608041157, "learning_rate": 1.583396976196192e-06, "loss": 0.021, "step": 11239 }, { "epoch": 0.8750912364361831, "grad_norm": 0.4240573576159887, "learning_rate": 1.5814509260630128e-06, "loss": 0.0331, "step": 11240 }, { "epoch": 0.8751690915283927, "grad_norm": 0.5018064807452401, "learning_rate": 1.5795060233188685e-06, "loss": 0.0488, "step": 11241 }, { "epoch": 0.8752469466206024, "grad_norm": 0.5259408157961786, "learning_rate": 1.5775622680849156e-06, "loss": 0.05, "step": 11242 }, { "epoch": 0.875324801712812, "grad_norm": 0.3564002529268089, "learning_rate": 1.5756196604822416e-06, "loss": 0.0253, "step": 11243 }, { "epoch": 0.8754026568050216, "grad_norm": 0.5337556309674282, "learning_rate": 1.5736782006318586e-06, "loss": 0.0529, "step": 11244 }, { "epoch": 0.8754805118972313, "grad_norm": 0.4576473265348401, "learning_rate": 1.5717378886547096e-06, "loss": 0.0409, "step": 11245 }, { "epoch": 0.8755583669894409, "grad_norm": 0.38443835506339924, "learning_rate": 1.5697987246716739e-06, "loss": 0.0292, "step": 11246 }, { "epoch": 0.8756362220816505, "grad_norm": 0.5071108721847424, "learning_rate": 1.5678607088035503e-06, "loss": 0.0498, "step": 11247 }, { "epoch": 0.8757140771738602, "grad_norm": 0.3063417765699831, "learning_rate": 1.5659238411710641e-06, "loss": 0.0221, "step": 11248 }, { "epoch": 0.8757919322660698, "grad_norm": 0.40067701871780037, "learning_rate": 1.5639881218948727e-06, "loss": 0.0353, "step": 11249 }, { "epoch": 0.8758697873582794, "grad_norm": 0.5002636468279665, "learning_rate": 1.5620535510955659e-06, "loss": 0.0448, "step": 11250 }, { "epoch": 0.8758697873582794, "eval_loss": 0.005182578228414059, "eval_runtime": 161.8986, "eval_samples_per_second": 17.789, "eval_steps_per_second": 0.636, "step": 11250 }, { "epoch": 0.8759476424504891, "grad_norm": 0.5459367570750938, "learning_rate": 1.5601201288936541e-06, "loss": 0.054, "step": 11251 }, { "epoch": 0.8760254975426987, "grad_norm": 0.3905334016699885, "learning_rate": 1.5581878554095854e-06, "loss": 0.027, "step": 11252 }, { "epoch": 0.8761033526349082, "grad_norm": 0.414629950721322, "learning_rate": 1.5562567307637277e-06, "loss": 0.0318, "step": 11253 }, { "epoch": 0.8761812077271179, "grad_norm": 0.42405215115962086, "learning_rate": 1.5543267550763853e-06, "loss": 0.04, "step": 11254 }, { "epoch": 0.8762590628193275, "grad_norm": 0.4881894288117456, "learning_rate": 1.5523979284677814e-06, "loss": 0.0459, "step": 11255 }, { "epoch": 0.8763369179115371, "grad_norm": 0.3796870920303507, "learning_rate": 1.550470251058076e-06, "loss": 0.0338, "step": 11256 }, { "epoch": 0.8764147730037468, "grad_norm": 0.3396726652302009, "learning_rate": 1.5485437229673528e-06, "loss": 0.0265, "step": 11257 }, { "epoch": 0.8764926280959564, "grad_norm": 0.46632059487563854, "learning_rate": 1.546618344315629e-06, "loss": 0.0464, "step": 11258 }, { "epoch": 0.876570483188166, "grad_norm": 0.390700237602951, "learning_rate": 1.5446941152228401e-06, "loss": 0.0304, "step": 11259 }, { "epoch": 0.8766483382803757, "grad_norm": 0.41847530620330137, "learning_rate": 1.5427710358088632e-06, "loss": 0.0375, "step": 11260 }, { "epoch": 0.8767261933725853, "grad_norm": 0.4230658730196165, "learning_rate": 1.5408491061934894e-06, "loss": 0.0303, "step": 11261 }, { "epoch": 0.8768040484647949, "grad_norm": 0.4927955404822766, "learning_rate": 1.5389283264964539e-06, "loss": 0.0533, "step": 11262 }, { "epoch": 0.8768819035570046, "grad_norm": 0.30247023191164046, "learning_rate": 1.5370086968374075e-06, "loss": 0.0197, "step": 11263 }, { "epoch": 0.8769597586492142, "grad_norm": 0.39705400093726034, "learning_rate": 1.5350902173359372e-06, "loss": 0.0351, "step": 11264 }, { "epoch": 0.8770376137414237, "grad_norm": 0.5292941859006157, "learning_rate": 1.5331728881115537e-06, "loss": 0.0495, "step": 11265 }, { "epoch": 0.8771154688336334, "grad_norm": 0.7218058837520981, "learning_rate": 1.5312567092836972e-06, "loss": 0.0993, "step": 11266 }, { "epoch": 0.877193323925843, "grad_norm": 0.41863643606768197, "learning_rate": 1.5293416809717388e-06, "loss": 0.0318, "step": 11267 }, { "epoch": 0.8772711790180526, "grad_norm": 0.5750167752904377, "learning_rate": 1.527427803294972e-06, "loss": 0.0678, "step": 11268 }, { "epoch": 0.8773490341102623, "grad_norm": 0.39875329262108017, "learning_rate": 1.5255150763726234e-06, "loss": 0.0277, "step": 11269 }, { "epoch": 0.8774268892024719, "grad_norm": 0.51624393679211, "learning_rate": 1.5236035003238492e-06, "loss": 0.0436, "step": 11270 }, { "epoch": 0.8775047442946815, "grad_norm": 0.39935471282849566, "learning_rate": 1.5216930752677272e-06, "loss": 0.0339, "step": 11271 }, { "epoch": 0.8775825993868911, "grad_norm": 0.2814078714522095, "learning_rate": 1.5197838013232736e-06, "loss": 0.019, "step": 11272 }, { "epoch": 0.8776604544791008, "grad_norm": 0.4645829330741123, "learning_rate": 1.5178756786094239e-06, "loss": 0.0457, "step": 11273 }, { "epoch": 0.8777383095713104, "grad_norm": 0.4121481505790328, "learning_rate": 1.515968707245048e-06, "loss": 0.0402, "step": 11274 }, { "epoch": 0.87781616466352, "grad_norm": 0.37847268530552214, "learning_rate": 1.5140628873489371e-06, "loss": 0.0402, "step": 11275 }, { "epoch": 0.8778940197557297, "grad_norm": 0.33334765319217863, "learning_rate": 1.5121582190398165e-06, "loss": 0.0293, "step": 11276 }, { "epoch": 0.8779718748479393, "grad_norm": 0.3601624642931581, "learning_rate": 1.5102547024363378e-06, "loss": 0.0277, "step": 11277 }, { "epoch": 0.8780497299401488, "grad_norm": 0.5020263581820238, "learning_rate": 1.5083523376570842e-06, "loss": 0.054, "step": 11278 }, { "epoch": 0.8781275850323585, "grad_norm": 0.4141435977403228, "learning_rate": 1.5064511248205605e-06, "loss": 0.0375, "step": 11279 }, { "epoch": 0.8782054401245681, "grad_norm": 0.3925163292763131, "learning_rate": 1.5045510640452033e-06, "loss": 0.0328, "step": 11280 }, { "epoch": 0.8782832952167777, "grad_norm": 0.5098781203451663, "learning_rate": 1.5026521554493756e-06, "loss": 0.0425, "step": 11281 }, { "epoch": 0.8783611503089874, "grad_norm": 0.42821456751931253, "learning_rate": 1.5007543991513762e-06, "loss": 0.0389, "step": 11282 }, { "epoch": 0.878439005401197, "grad_norm": 0.4698992802198139, "learning_rate": 1.4988577952694239e-06, "loss": 0.0459, "step": 11283 }, { "epoch": 0.8785168604934066, "grad_norm": 0.44819054146059745, "learning_rate": 1.4969623439216685e-06, "loss": 0.0424, "step": 11284 }, { "epoch": 0.8785947155856163, "grad_norm": 0.4826976387604964, "learning_rate": 1.495068045226189e-06, "loss": 0.0453, "step": 11285 }, { "epoch": 0.8786725706778259, "grad_norm": 0.3854789391678489, "learning_rate": 1.4931748993009888e-06, "loss": 0.03, "step": 11286 }, { "epoch": 0.8787504257700355, "grad_norm": 0.6656099480564134, "learning_rate": 1.4912829062639978e-06, "loss": 0.075, "step": 11287 }, { "epoch": 0.8788282808622452, "grad_norm": 0.4294724606848606, "learning_rate": 1.4893920662330885e-06, "loss": 0.0402, "step": 11288 }, { "epoch": 0.8789061359544548, "grad_norm": 0.5089269125127373, "learning_rate": 1.4875023793260423e-06, "loss": 0.0395, "step": 11289 }, { "epoch": 0.8789839910466644, "grad_norm": 0.4839942983146161, "learning_rate": 1.485613845660583e-06, "loss": 0.0453, "step": 11290 }, { "epoch": 0.8790618461388741, "grad_norm": 0.4474581557876423, "learning_rate": 1.4837264653543559e-06, "loss": 0.0383, "step": 11291 }, { "epoch": 0.8791397012310836, "grad_norm": 0.5117701535241868, "learning_rate": 1.4818402385249343e-06, "loss": 0.0544, "step": 11292 }, { "epoch": 0.8792175563232932, "grad_norm": 0.5586802737003302, "learning_rate": 1.4799551652898237e-06, "loss": 0.0554, "step": 11293 }, { "epoch": 0.8792954114155029, "grad_norm": 0.4518231610001344, "learning_rate": 1.4780712457664548e-06, "loss": 0.0444, "step": 11294 }, { "epoch": 0.8793732665077125, "grad_norm": 0.3601067579946018, "learning_rate": 1.4761884800721893e-06, "loss": 0.0232, "step": 11295 }, { "epoch": 0.8794511215999221, "grad_norm": 0.3804006575485091, "learning_rate": 1.474306868324309e-06, "loss": 0.0231, "step": 11296 }, { "epoch": 0.8795289766921318, "grad_norm": 0.34389918852032136, "learning_rate": 1.472426410640031e-06, "loss": 0.0241, "step": 11297 }, { "epoch": 0.8796068317843414, "grad_norm": 0.41443633523752493, "learning_rate": 1.470547107136502e-06, "loss": 0.0325, "step": 11298 }, { "epoch": 0.879684686876551, "grad_norm": 0.42853921882931634, "learning_rate": 1.4686689579307922e-06, "loss": 0.0324, "step": 11299 }, { "epoch": 0.8797625419687607, "grad_norm": 0.4890502666152363, "learning_rate": 1.4667919631398996e-06, "loss": 0.0563, "step": 11300 }, { "epoch": 0.8797625419687607, "eval_loss": 0.005101290997117758, "eval_runtime": 163.0898, "eval_samples_per_second": 17.659, "eval_steps_per_second": 0.632, "step": 11300 }, { "epoch": 0.8798403970609703, "grad_norm": 0.46999146594329894, "learning_rate": 1.4649161228807573e-06, "loss": 0.0467, "step": 11301 }, { "epoch": 0.8799182521531799, "grad_norm": 0.4893884038837823, "learning_rate": 1.4630414372702118e-06, "loss": 0.0566, "step": 11302 }, { "epoch": 0.8799961072453896, "grad_norm": 0.3688490669839151, "learning_rate": 1.4611679064250584e-06, "loss": 0.0272, "step": 11303 }, { "epoch": 0.8800739623375992, "grad_norm": 0.4466912551509967, "learning_rate": 1.4592955304620038e-06, "loss": 0.0447, "step": 11304 }, { "epoch": 0.8801518174298087, "grad_norm": 0.4360512024813704, "learning_rate": 1.4574243094976926e-06, "loss": 0.042, "step": 11305 }, { "epoch": 0.8802296725220184, "grad_norm": 0.32031418168275866, "learning_rate": 1.4555542436486847e-06, "loss": 0.0262, "step": 11306 }, { "epoch": 0.880307527614228, "grad_norm": 0.45919513266740697, "learning_rate": 1.453685333031476e-06, "loss": 0.0519, "step": 11307 }, { "epoch": 0.8803853827064376, "grad_norm": 0.39605569862049816, "learning_rate": 1.4518175777625022e-06, "loss": 0.0407, "step": 11308 }, { "epoch": 0.8804632377986472, "grad_norm": 0.38699866334559374, "learning_rate": 1.4499509779581078e-06, "loss": 0.0338, "step": 11309 }, { "epoch": 0.8805410928908569, "grad_norm": 0.5094222275921649, "learning_rate": 1.4480855337345733e-06, "loss": 0.0442, "step": 11310 }, { "epoch": 0.8806189479830665, "grad_norm": 0.39582492502509165, "learning_rate": 1.4462212452081104e-06, "loss": 0.03, "step": 11311 }, { "epoch": 0.8806968030752761, "grad_norm": 0.5951416204494097, "learning_rate": 1.4443581124948502e-06, "loss": 0.0573, "step": 11312 }, { "epoch": 0.8807746581674858, "grad_norm": 0.4427906740411047, "learning_rate": 1.4424961357108625e-06, "loss": 0.0284, "step": 11313 }, { "epoch": 0.8808525132596954, "grad_norm": 0.4394743028849236, "learning_rate": 1.4406353149721408e-06, "loss": 0.0373, "step": 11314 }, { "epoch": 0.880930368351905, "grad_norm": 0.6011649098190948, "learning_rate": 1.4387756503945993e-06, "loss": 0.0752, "step": 11315 }, { "epoch": 0.8810082234441147, "grad_norm": 0.40153676539974303, "learning_rate": 1.4369171420940898e-06, "loss": 0.0298, "step": 11316 }, { "epoch": 0.8810860785363243, "grad_norm": 0.368731371688386, "learning_rate": 1.435059790186384e-06, "loss": 0.0372, "step": 11317 }, { "epoch": 0.8811639336285338, "grad_norm": 0.48585179465171296, "learning_rate": 1.4332035947871937e-06, "loss": 0.0444, "step": 11318 }, { "epoch": 0.8812417887207435, "grad_norm": 0.3841115115268381, "learning_rate": 1.4313485560121487e-06, "loss": 0.0311, "step": 11319 }, { "epoch": 0.8813196438129531, "grad_norm": 0.4383350779815229, "learning_rate": 1.4294946739768078e-06, "loss": 0.0436, "step": 11320 }, { "epoch": 0.8813974989051627, "grad_norm": 0.3441867009119746, "learning_rate": 1.4276419487966586e-06, "loss": 0.0196, "step": 11321 }, { "epoch": 0.8814753539973724, "grad_norm": 0.4283605168592138, "learning_rate": 1.4257903805871154e-06, "loss": 0.0444, "step": 11322 }, { "epoch": 0.881553209089582, "grad_norm": 0.522338937892069, "learning_rate": 1.4239399694635325e-06, "loss": 0.0529, "step": 11323 }, { "epoch": 0.8816310641817916, "grad_norm": 0.5364934495380003, "learning_rate": 1.4220907155411713e-06, "loss": 0.0543, "step": 11324 }, { "epoch": 0.8817089192740013, "grad_norm": 0.558720992400805, "learning_rate": 1.4202426189352347e-06, "loss": 0.0543, "step": 11325 }, { "epoch": 0.8817867743662109, "grad_norm": 0.4658145938460827, "learning_rate": 1.418395679760849e-06, "loss": 0.0446, "step": 11326 }, { "epoch": 0.8818646294584205, "grad_norm": 0.5690713912428333, "learning_rate": 1.4165498981330706e-06, "loss": 0.0472, "step": 11327 }, { "epoch": 0.8819424845506302, "grad_norm": 0.3929212451677227, "learning_rate": 1.4147052741668831e-06, "loss": 0.0316, "step": 11328 }, { "epoch": 0.8820203396428398, "grad_norm": 0.40219805422922217, "learning_rate": 1.4128618079771994e-06, "loss": 0.0369, "step": 11329 }, { "epoch": 0.8820981947350494, "grad_norm": 0.4158878759561703, "learning_rate": 1.4110194996788584e-06, "loss": 0.0364, "step": 11330 }, { "epoch": 0.8821760498272591, "grad_norm": 0.44294995113634683, "learning_rate": 1.4091783493866262e-06, "loss": 0.0432, "step": 11331 }, { "epoch": 0.8822539049194686, "grad_norm": 0.26576747769018955, "learning_rate": 1.4073383572152e-06, "loss": 0.0185, "step": 11332 }, { "epoch": 0.8823317600116782, "grad_norm": 0.46521279175916513, "learning_rate": 1.405499523279199e-06, "loss": 0.0426, "step": 11333 }, { "epoch": 0.8824096151038879, "grad_norm": 0.46625995729522257, "learning_rate": 1.4036618476931762e-06, "loss": 0.043, "step": 11334 }, { "epoch": 0.8824874701960975, "grad_norm": 0.3281009258136172, "learning_rate": 1.4018253305716111e-06, "loss": 0.0265, "step": 11335 }, { "epoch": 0.8825653252883071, "grad_norm": 0.3807144465180845, "learning_rate": 1.399989972028908e-06, "loss": 0.0311, "step": 11336 }, { "epoch": 0.8826431803805168, "grad_norm": 0.472544152694454, "learning_rate": 1.3981557721794015e-06, "loss": 0.0498, "step": 11337 }, { "epoch": 0.8827210354727264, "grad_norm": 0.5132626285055603, "learning_rate": 1.3963227311373494e-06, "loss": 0.0523, "step": 11338 }, { "epoch": 0.882798890564936, "grad_norm": 0.3995782691529558, "learning_rate": 1.3944908490169518e-06, "loss": 0.0316, "step": 11339 }, { "epoch": 0.8828767456571457, "grad_norm": 0.5726366431242063, "learning_rate": 1.3926601259323191e-06, "loss": 0.073, "step": 11340 }, { "epoch": 0.8829546007493553, "grad_norm": 0.5035866877101751, "learning_rate": 1.3908305619975004e-06, "loss": 0.0593, "step": 11341 }, { "epoch": 0.8830324558415649, "grad_norm": 0.3099000550160958, "learning_rate": 1.3890021573264645e-06, "loss": 0.0171, "step": 11342 }, { "epoch": 0.8831103109337746, "grad_norm": 0.49732481506488685, "learning_rate": 1.3871749120331157e-06, "loss": 0.0539, "step": 11343 }, { "epoch": 0.8831881660259842, "grad_norm": 0.3690619542283914, "learning_rate": 1.3853488262312808e-06, "loss": 0.0296, "step": 11344 }, { "epoch": 0.8832660211181937, "grad_norm": 0.3764795861004298, "learning_rate": 1.38352390003472e-06, "loss": 0.0287, "step": 11345 }, { "epoch": 0.8833438762104033, "grad_norm": 0.4858894184917364, "learning_rate": 1.3817001335571135e-06, "loss": 0.0522, "step": 11346 }, { "epoch": 0.883421731302613, "grad_norm": 0.4860674967388264, "learning_rate": 1.3798775269120746e-06, "loss": 0.0548, "step": 11347 }, { "epoch": 0.8834995863948226, "grad_norm": 0.3268922885841208, "learning_rate": 1.3780560802131416e-06, "loss": 0.0221, "step": 11348 }, { "epoch": 0.8835774414870322, "grad_norm": 0.40292407781184264, "learning_rate": 1.3762357935737858e-06, "loss": 0.0406, "step": 11349 }, { "epoch": 0.8836552965792419, "grad_norm": 0.3884796894074064, "learning_rate": 1.3744166671074012e-06, "loss": 0.037, "step": 11350 }, { "epoch": 0.8836552965792419, "eval_loss": 0.005090781487524509, "eval_runtime": 162.4185, "eval_samples_per_second": 17.732, "eval_steps_per_second": 0.634, "step": 11350 }, { "epoch": 0.8837331516714515, "grad_norm": 0.33947258758438215, "learning_rate": 1.37259870092731e-06, "loss": 0.0271, "step": 11351 }, { "epoch": 0.8838110067636611, "grad_norm": 0.6562372328132209, "learning_rate": 1.3707818951467645e-06, "loss": 0.0837, "step": 11352 }, { "epoch": 0.8838888618558708, "grad_norm": 0.4673443566203426, "learning_rate": 1.3689662498789403e-06, "loss": 0.0442, "step": 11353 }, { "epoch": 0.8839667169480804, "grad_norm": 0.43462145243246575, "learning_rate": 1.3671517652369448e-06, "loss": 0.0316, "step": 11354 }, { "epoch": 0.88404457204029, "grad_norm": 0.37069371072530505, "learning_rate": 1.3653384413338122e-06, "loss": 0.0244, "step": 11355 }, { "epoch": 0.8841224271324997, "grad_norm": 0.38581212696445816, "learning_rate": 1.3635262782825054e-06, "loss": 0.0318, "step": 11356 }, { "epoch": 0.8842002822247093, "grad_norm": 0.4783558326985288, "learning_rate": 1.361715276195912e-06, "loss": 0.0463, "step": 11357 }, { "epoch": 0.8842781373169188, "grad_norm": 0.4678415877139958, "learning_rate": 1.3599054351868457e-06, "loss": 0.0459, "step": 11358 }, { "epoch": 0.8843559924091285, "grad_norm": 0.3675238921900079, "learning_rate": 1.358096755368059e-06, "loss": 0.0348, "step": 11359 }, { "epoch": 0.8844338475013381, "grad_norm": 0.4496640502427338, "learning_rate": 1.3562892368522174e-06, "loss": 0.0407, "step": 11360 }, { "epoch": 0.8845117025935477, "grad_norm": 0.4479655682127934, "learning_rate": 1.3544828797519238e-06, "loss": 0.0399, "step": 11361 }, { "epoch": 0.8845895576857574, "grad_norm": 0.3632369915405234, "learning_rate": 1.3526776841797107e-06, "loss": 0.024, "step": 11362 }, { "epoch": 0.884667412777967, "grad_norm": 0.44286896805056286, "learning_rate": 1.3508736502480214e-06, "loss": 0.0395, "step": 11363 }, { "epoch": 0.8847452678701766, "grad_norm": 0.4531450279134641, "learning_rate": 1.3490707780692437e-06, "loss": 0.0394, "step": 11364 }, { "epoch": 0.8848231229623863, "grad_norm": 0.468622614305925, "learning_rate": 1.347269067755692e-06, "loss": 0.0315, "step": 11365 }, { "epoch": 0.8849009780545959, "grad_norm": 0.36819861649384916, "learning_rate": 1.3454685194196016e-06, "loss": 0.0236, "step": 11366 }, { "epoch": 0.8849788331468055, "grad_norm": 0.5960152142624502, "learning_rate": 1.34366913317314e-06, "loss": 0.0707, "step": 11367 }, { "epoch": 0.8850566882390152, "grad_norm": 0.3601843966501285, "learning_rate": 1.3418709091283978e-06, "loss": 0.0248, "step": 11368 }, { "epoch": 0.8851345433312248, "grad_norm": 0.45585150246816447, "learning_rate": 1.3400738473973918e-06, "loss": 0.0476, "step": 11369 }, { "epoch": 0.8852123984234344, "grad_norm": 0.3886774337528988, "learning_rate": 1.3382779480920816e-06, "loss": 0.0301, "step": 11370 }, { "epoch": 0.885290253515644, "grad_norm": 0.8746635420519069, "learning_rate": 1.3364832113243397e-06, "loss": 0.0846, "step": 11371 }, { "epoch": 0.8853681086078536, "grad_norm": 0.43131473057191616, "learning_rate": 1.3346896372059636e-06, "loss": 0.0406, "step": 11372 }, { "epoch": 0.8854459637000632, "grad_norm": 0.4222279137408329, "learning_rate": 1.3328972258486906e-06, "loss": 0.0291, "step": 11373 }, { "epoch": 0.8855238187922729, "grad_norm": 0.4254993346095062, "learning_rate": 1.331105977364171e-06, "loss": 0.029, "step": 11374 }, { "epoch": 0.8856016738844825, "grad_norm": 0.3489381144067433, "learning_rate": 1.3293158918640025e-06, "loss": 0.0215, "step": 11375 }, { "epoch": 0.8856795289766921, "grad_norm": 0.4288306784750984, "learning_rate": 1.3275269694596938e-06, "loss": 0.0328, "step": 11376 }, { "epoch": 0.8857573840689018, "grad_norm": 0.37460806263076935, "learning_rate": 1.3257392102626865e-06, "loss": 0.0284, "step": 11377 }, { "epoch": 0.8858352391611114, "grad_norm": 0.39778125304514367, "learning_rate": 1.3239526143843495e-06, "loss": 0.0308, "step": 11378 }, { "epoch": 0.885913094253321, "grad_norm": 0.4264136659868211, "learning_rate": 1.322167181935976e-06, "loss": 0.0402, "step": 11379 }, { "epoch": 0.8859909493455306, "grad_norm": 0.4383668479728877, "learning_rate": 1.3203829130288015e-06, "loss": 0.0414, "step": 11380 }, { "epoch": 0.8860688044377403, "grad_norm": 0.46507950602581305, "learning_rate": 1.3185998077739637e-06, "loss": 0.0403, "step": 11381 }, { "epoch": 0.8861466595299499, "grad_norm": 0.49591041745729936, "learning_rate": 1.3168178662825493e-06, "loss": 0.0358, "step": 11382 }, { "epoch": 0.8862245146221595, "grad_norm": 0.4971933142165385, "learning_rate": 1.3150370886655629e-06, "loss": 0.0553, "step": 11383 }, { "epoch": 0.8863023697143692, "grad_norm": 0.4605436971469385, "learning_rate": 1.3132574750339355e-06, "loss": 0.0337, "step": 11384 }, { "epoch": 0.8863802248065787, "grad_norm": 0.4401794634499213, "learning_rate": 1.3114790254985343e-06, "loss": 0.0356, "step": 11385 }, { "epoch": 0.8864580798987883, "grad_norm": 0.4382812761996334, "learning_rate": 1.3097017401701485e-06, "loss": 0.0282, "step": 11386 }, { "epoch": 0.886535934990998, "grad_norm": 0.531057895811409, "learning_rate": 1.3079256191594892e-06, "loss": 0.0586, "step": 11387 }, { "epoch": 0.8866137900832076, "grad_norm": 0.3861488851407509, "learning_rate": 1.3061506625772058e-06, "loss": 0.0311, "step": 11388 }, { "epoch": 0.8866916451754172, "grad_norm": 0.42928754001035124, "learning_rate": 1.3043768705338655e-06, "loss": 0.0383, "step": 11389 }, { "epoch": 0.8867695002676269, "grad_norm": 0.4613316773486423, "learning_rate": 1.3026042431399689e-06, "loss": 0.0323, "step": 11390 }, { "epoch": 0.8868473553598365, "grad_norm": 0.39927784483439493, "learning_rate": 1.300832780505943e-06, "loss": 0.0309, "step": 11391 }, { "epoch": 0.8869252104520461, "grad_norm": 0.4439838789633012, "learning_rate": 1.2990624827421417e-06, "loss": 0.0396, "step": 11392 }, { "epoch": 0.8870030655442558, "grad_norm": 0.4407898218803142, "learning_rate": 1.2972933499588436e-06, "loss": 0.0371, "step": 11393 }, { "epoch": 0.8870809206364654, "grad_norm": 0.3907951828053832, "learning_rate": 1.2955253822662583e-06, "loss": 0.035, "step": 11394 }, { "epoch": 0.887158775728675, "grad_norm": 0.30767191172570457, "learning_rate": 1.2937585797745244e-06, "loss": 0.0221, "step": 11395 }, { "epoch": 0.8872366308208847, "grad_norm": 0.5791994226181162, "learning_rate": 1.291992942593705e-06, "loss": 0.0505, "step": 11396 }, { "epoch": 0.8873144859130943, "grad_norm": 0.5506348455405053, "learning_rate": 1.2902284708337897e-06, "loss": 0.0662, "step": 11397 }, { "epoch": 0.8873923410053038, "grad_norm": 0.5036965345546951, "learning_rate": 1.2884651646046975e-06, "loss": 0.0395, "step": 11398 }, { "epoch": 0.8874701960975135, "grad_norm": 0.3845879718934902, "learning_rate": 1.2867030240162714e-06, "loss": 0.0343, "step": 11399 }, { "epoch": 0.8875480511897231, "grad_norm": 0.39628430796100506, "learning_rate": 1.2849420491782883e-06, "loss": 0.0339, "step": 11400 }, { "epoch": 0.8875480511897231, "eval_loss": 0.005068584345281124, "eval_runtime": 162.6506, "eval_samples_per_second": 17.707, "eval_steps_per_second": 0.633, "step": 11400 }, { "epoch": 0.8876259062819327, "grad_norm": 0.49572457065300185, "learning_rate": 1.2831822402004468e-06, "loss": 0.0601, "step": 11401 }, { "epoch": 0.8877037613741424, "grad_norm": 0.3692590424040073, "learning_rate": 1.281423597192375e-06, "loss": 0.0325, "step": 11402 }, { "epoch": 0.887781616466352, "grad_norm": 0.3843742646219767, "learning_rate": 1.2796661202636296e-06, "loss": 0.0437, "step": 11403 }, { "epoch": 0.8878594715585616, "grad_norm": 0.46829978207668915, "learning_rate": 1.2779098095236897e-06, "loss": 0.039, "step": 11404 }, { "epoch": 0.8879373266507713, "grad_norm": 0.45280855568165707, "learning_rate": 1.2761546650819635e-06, "loss": 0.0393, "step": 11405 }, { "epoch": 0.8880151817429809, "grad_norm": 0.34214222718045434, "learning_rate": 1.2744006870477943e-06, "loss": 0.0208, "step": 11406 }, { "epoch": 0.8880930368351905, "grad_norm": 0.30136468239912656, "learning_rate": 1.2726478755304461e-06, "loss": 0.0216, "step": 11407 }, { "epoch": 0.8881708919274002, "grad_norm": 0.4532977000284785, "learning_rate": 1.2708962306391093e-06, "loss": 0.0418, "step": 11408 }, { "epoch": 0.8882487470196098, "grad_norm": 0.5435793824815465, "learning_rate": 1.269145752482901e-06, "loss": 0.0652, "step": 11409 }, { "epoch": 0.8883266021118194, "grad_norm": 0.35587323860652464, "learning_rate": 1.2673964411708695e-06, "loss": 0.0309, "step": 11410 }, { "epoch": 0.888404457204029, "grad_norm": 0.3687757722825429, "learning_rate": 1.2656482968119898e-06, "loss": 0.0278, "step": 11411 }, { "epoch": 0.8884823122962386, "grad_norm": 0.4361221699620528, "learning_rate": 1.2639013195151617e-06, "loss": 0.0485, "step": 11412 }, { "epoch": 0.8885601673884482, "grad_norm": 0.4919274642219566, "learning_rate": 1.2621555093892157e-06, "loss": 0.0441, "step": 11413 }, { "epoch": 0.8886380224806579, "grad_norm": 0.37379820844364287, "learning_rate": 1.2604108665429027e-06, "loss": 0.0292, "step": 11414 }, { "epoch": 0.8887158775728675, "grad_norm": 0.3617962175919362, "learning_rate": 1.2586673910849067e-06, "loss": 0.0293, "step": 11415 }, { "epoch": 0.8887937326650771, "grad_norm": 0.3602102703800251, "learning_rate": 1.2569250831238456e-06, "loss": 0.0281, "step": 11416 }, { "epoch": 0.8888715877572867, "grad_norm": 0.3616431549418618, "learning_rate": 1.2551839427682477e-06, "loss": 0.0263, "step": 11417 }, { "epoch": 0.8889494428494964, "grad_norm": 0.4418254179812698, "learning_rate": 1.2534439701265844e-06, "loss": 0.0478, "step": 11418 }, { "epoch": 0.889027297941706, "grad_norm": 0.3946452854632738, "learning_rate": 1.2517051653072443e-06, "loss": 0.033, "step": 11419 }, { "epoch": 0.8891051530339156, "grad_norm": 0.433517419095666, "learning_rate": 1.2499675284185476e-06, "loss": 0.0256, "step": 11420 }, { "epoch": 0.8891830081261253, "grad_norm": 0.42466362987609124, "learning_rate": 1.2482310595687431e-06, "loss": 0.0362, "step": 11421 }, { "epoch": 0.8892608632183349, "grad_norm": 0.4785965496163524, "learning_rate": 1.2464957588659998e-06, "loss": 0.0449, "step": 11422 }, { "epoch": 0.8893387183105445, "grad_norm": 0.3146578663495818, "learning_rate": 1.2447616264184225e-06, "loss": 0.0265, "step": 11423 }, { "epoch": 0.8894165734027542, "grad_norm": 0.5766326539709197, "learning_rate": 1.2430286623340382e-06, "loss": 0.0547, "step": 11424 }, { "epoch": 0.8894944284949637, "grad_norm": 0.4061184373072967, "learning_rate": 1.241296866720798e-06, "loss": 0.0377, "step": 11425 }, { "epoch": 0.8895722835871733, "grad_norm": 0.5532876758974562, "learning_rate": 1.239566239686596e-06, "loss": 0.0584, "step": 11426 }, { "epoch": 0.889650138679383, "grad_norm": 0.45261464098189874, "learning_rate": 1.2378367813392322e-06, "loss": 0.0418, "step": 11427 }, { "epoch": 0.8897279937715926, "grad_norm": 0.4196328172075648, "learning_rate": 1.2361084917864497e-06, "loss": 0.0329, "step": 11428 }, { "epoch": 0.8898058488638022, "grad_norm": 0.5394599334237622, "learning_rate": 1.234381371135911e-06, "loss": 0.058, "step": 11429 }, { "epoch": 0.8898837039560119, "grad_norm": 0.38439056108583775, "learning_rate": 1.2326554194952011e-06, "loss": 0.0311, "step": 11430 }, { "epoch": 0.8899615590482215, "grad_norm": 0.4555204489393799, "learning_rate": 1.2309306369718476e-06, "loss": 0.0391, "step": 11431 }, { "epoch": 0.8900394141404311, "grad_norm": 0.5263242367612041, "learning_rate": 1.2292070236732933e-06, "loss": 0.0574, "step": 11432 }, { "epoch": 0.8901172692326408, "grad_norm": 0.4463832699883386, "learning_rate": 1.2274845797069102e-06, "loss": 0.0461, "step": 11433 }, { "epoch": 0.8901951243248504, "grad_norm": 0.5182249457261788, "learning_rate": 1.225763305179999e-06, "loss": 0.0526, "step": 11434 }, { "epoch": 0.89027297941706, "grad_norm": 0.45727506541403956, "learning_rate": 1.2240432001997849e-06, "loss": 0.0382, "step": 11435 }, { "epoch": 0.8903508345092697, "grad_norm": 0.4361958309695378, "learning_rate": 1.2223242648734268e-06, "loss": 0.0439, "step": 11436 }, { "epoch": 0.8904286896014793, "grad_norm": 0.4429331425037572, "learning_rate": 1.2206064993080036e-06, "loss": 0.031, "step": 11437 }, { "epoch": 0.8905065446936888, "grad_norm": 0.43251942824295486, "learning_rate": 1.2188899036105295e-06, "loss": 0.0375, "step": 11438 }, { "epoch": 0.8905843997858985, "grad_norm": 0.421240826026806, "learning_rate": 1.2171744778879324e-06, "loss": 0.0383, "step": 11439 }, { "epoch": 0.8906622548781081, "grad_norm": 0.30527883815525547, "learning_rate": 1.2154602222470757e-06, "loss": 0.0164, "step": 11440 }, { "epoch": 0.8907401099703177, "grad_norm": 0.38078304147385605, "learning_rate": 1.2137471367947518e-06, "loss": 0.0266, "step": 11441 }, { "epoch": 0.8908179650625274, "grad_norm": 0.46849963019892316, "learning_rate": 1.2120352216376796e-06, "loss": 0.0507, "step": 11442 }, { "epoch": 0.890895820154737, "grad_norm": 0.4022019993490177, "learning_rate": 1.2103244768825028e-06, "loss": 0.0378, "step": 11443 }, { "epoch": 0.8909736752469466, "grad_norm": 0.422256646664387, "learning_rate": 1.2086149026357897e-06, "loss": 0.0367, "step": 11444 }, { "epoch": 0.8910515303391563, "grad_norm": 0.5696602626209237, "learning_rate": 1.2069064990040435e-06, "loss": 0.0537, "step": 11445 }, { "epoch": 0.8911293854313659, "grad_norm": 0.37286603146704517, "learning_rate": 1.2051992660936818e-06, "loss": 0.0294, "step": 11446 }, { "epoch": 0.8912072405235755, "grad_norm": 0.43281719044423844, "learning_rate": 1.2034932040110702e-06, "loss": 0.0442, "step": 11447 }, { "epoch": 0.8912850956157852, "grad_norm": 0.4091561309513955, "learning_rate": 1.2017883128624774e-06, "loss": 0.0335, "step": 11448 }, { "epoch": 0.8913629507079948, "grad_norm": 0.40558551509349355, "learning_rate": 1.2000845927541138e-06, "loss": 0.0354, "step": 11449 }, { "epoch": 0.8914408058002044, "grad_norm": 0.44162001670838535, "learning_rate": 1.19838204379211e-06, "loss": 0.0374, "step": 11450 }, { "epoch": 0.8914408058002044, "eval_loss": 0.005023773293942213, "eval_runtime": 162.6943, "eval_samples_per_second": 17.702, "eval_steps_per_second": 0.633, "step": 11450 }, { "epoch": 0.891518660892414, "grad_norm": 0.33986437827101645, "learning_rate": 1.1966806660825303e-06, "loss": 0.0309, "step": 11451 }, { "epoch": 0.8915965159846236, "grad_norm": 0.26303019135077826, "learning_rate": 1.1949804597313608e-06, "loss": 0.0116, "step": 11452 }, { "epoch": 0.8916743710768332, "grad_norm": 0.5609619360452653, "learning_rate": 1.193281424844519e-06, "loss": 0.0639, "step": 11453 }, { "epoch": 0.8917522261690428, "grad_norm": 0.47325041014674907, "learning_rate": 1.1915835615278471e-06, "loss": 0.0361, "step": 11454 }, { "epoch": 0.8918300812612525, "grad_norm": 0.48486817927798226, "learning_rate": 1.1898868698871092e-06, "loss": 0.0539, "step": 11455 }, { "epoch": 0.8919079363534621, "grad_norm": 0.46395058349999524, "learning_rate": 1.1881913500280051e-06, "loss": 0.0382, "step": 11456 }, { "epoch": 0.8919857914456717, "grad_norm": 0.5590157457096752, "learning_rate": 1.186497002056155e-06, "loss": 0.0542, "step": 11457 }, { "epoch": 0.8920636465378814, "grad_norm": 0.32875877896952416, "learning_rate": 1.1848038260771122e-06, "loss": 0.0183, "step": 11458 }, { "epoch": 0.892141501630091, "grad_norm": 0.41489637227309384, "learning_rate": 1.1831118221963522e-06, "loss": 0.0347, "step": 11459 }, { "epoch": 0.8922193567223006, "grad_norm": 0.43431434590097484, "learning_rate": 1.1814209905192775e-06, "loss": 0.0377, "step": 11460 }, { "epoch": 0.8922972118145103, "grad_norm": 0.4582260288104876, "learning_rate": 1.1797313311512171e-06, "loss": 0.043, "step": 11461 }, { "epoch": 0.8923750669067199, "grad_norm": 0.5128360922916808, "learning_rate": 1.1780428441974334e-06, "loss": 0.0577, "step": 11462 }, { "epoch": 0.8924529219989294, "grad_norm": 0.47785230290223335, "learning_rate": 1.1763555297631113e-06, "loss": 0.051, "step": 11463 }, { "epoch": 0.8925307770911391, "grad_norm": 0.45456213950146535, "learning_rate": 1.17466938795336e-06, "loss": 0.0399, "step": 11464 }, { "epoch": 0.8926086321833487, "grad_norm": 0.37907750159360964, "learning_rate": 1.1729844188732199e-06, "loss": 0.0273, "step": 11465 }, { "epoch": 0.8926864872755583, "grad_norm": 0.45032230740465373, "learning_rate": 1.1713006226276536e-06, "loss": 0.0442, "step": 11466 }, { "epoch": 0.892764342367768, "grad_norm": 0.4198102358234348, "learning_rate": 1.1696179993215574e-06, "loss": 0.035, "step": 11467 }, { "epoch": 0.8928421974599776, "grad_norm": 0.49259167031426715, "learning_rate": 1.1679365490597495e-06, "loss": 0.0415, "step": 11468 }, { "epoch": 0.8929200525521872, "grad_norm": 0.3501791860783724, "learning_rate": 1.166256271946975e-06, "loss": 0.0224, "step": 11469 }, { "epoch": 0.8929979076443969, "grad_norm": 0.38828204990370674, "learning_rate": 1.164577168087908e-06, "loss": 0.0347, "step": 11470 }, { "epoch": 0.8930757627366065, "grad_norm": 0.4578611342952572, "learning_rate": 1.1628992375871474e-06, "loss": 0.0374, "step": 11471 }, { "epoch": 0.8931536178288161, "grad_norm": 0.445197457444716, "learning_rate": 1.1612224805492244e-06, "loss": 0.0344, "step": 11472 }, { "epoch": 0.8932314729210258, "grad_norm": 0.3593038601834151, "learning_rate": 1.1595468970785918e-06, "loss": 0.0238, "step": 11473 }, { "epoch": 0.8933093280132354, "grad_norm": 0.5718377471033801, "learning_rate": 1.15787248727963e-06, "loss": 0.0595, "step": 11474 }, { "epoch": 0.893387183105445, "grad_norm": 0.4866739155513266, "learning_rate": 1.1561992512566444e-06, "loss": 0.0437, "step": 11475 }, { "epoch": 0.8934650381976547, "grad_norm": 0.43253861880186795, "learning_rate": 1.1545271891138721e-06, "loss": 0.0358, "step": 11476 }, { "epoch": 0.8935428932898642, "grad_norm": 0.3351795537935047, "learning_rate": 1.1528563009554761e-06, "loss": 0.0199, "step": 11477 }, { "epoch": 0.8936207483820738, "grad_norm": 0.3861156839269693, "learning_rate": 1.1511865868855422e-06, "loss": 0.0298, "step": 11478 }, { "epoch": 0.8936986034742835, "grad_norm": 0.44121492808701634, "learning_rate": 1.1495180470080846e-06, "loss": 0.0333, "step": 11479 }, { "epoch": 0.8937764585664931, "grad_norm": 0.5285356387864651, "learning_rate": 1.1478506814270496e-06, "loss": 0.047, "step": 11480 }, { "epoch": 0.8938543136587027, "grad_norm": 0.4358510973081521, "learning_rate": 1.1461844902463049e-06, "loss": 0.0396, "step": 11481 }, { "epoch": 0.8939321687509124, "grad_norm": 0.532285857125012, "learning_rate": 1.1445194735696407e-06, "loss": 0.0551, "step": 11482 }, { "epoch": 0.894010023843122, "grad_norm": 0.5598379544148983, "learning_rate": 1.14285563150079e-06, "loss": 0.0667, "step": 11483 }, { "epoch": 0.8940878789353316, "grad_norm": 0.49387297243432154, "learning_rate": 1.1411929641433962e-06, "loss": 0.0408, "step": 11484 }, { "epoch": 0.8941657340275413, "grad_norm": 0.41752850952947784, "learning_rate": 1.1395314716010363e-06, "loss": 0.0332, "step": 11485 }, { "epoch": 0.8942435891197509, "grad_norm": 0.43371620622002005, "learning_rate": 1.1378711539772124e-06, "loss": 0.0264, "step": 11486 }, { "epoch": 0.8943214442119605, "grad_norm": 0.39810238053912406, "learning_rate": 1.1362120113753572e-06, "loss": 0.0272, "step": 11487 }, { "epoch": 0.8943992993041701, "grad_norm": 0.33326157043585336, "learning_rate": 1.1345540438988256e-06, "loss": 0.0195, "step": 11488 }, { "epoch": 0.8944771543963798, "grad_norm": 0.4088556775629984, "learning_rate": 1.132897251650902e-06, "loss": 0.0308, "step": 11489 }, { "epoch": 0.8945550094885893, "grad_norm": 0.529270728952028, "learning_rate": 1.1312416347347965e-06, "loss": 0.0593, "step": 11490 }, { "epoch": 0.8946328645807989, "grad_norm": 0.41227514618438843, "learning_rate": 1.129587193253645e-06, "loss": 0.0345, "step": 11491 }, { "epoch": 0.8947107196730086, "grad_norm": 0.287120764163676, "learning_rate": 1.1279339273105117e-06, "loss": 0.0182, "step": 11492 }, { "epoch": 0.8947885747652182, "grad_norm": 0.529786907130051, "learning_rate": 1.1262818370083894e-06, "loss": 0.0533, "step": 11493 }, { "epoch": 0.8948664298574278, "grad_norm": 0.4836981489197424, "learning_rate": 1.124630922450196e-06, "loss": 0.046, "step": 11494 }, { "epoch": 0.8949442849496375, "grad_norm": 0.36388389062092313, "learning_rate": 1.1229811837387717e-06, "loss": 0.0205, "step": 11495 }, { "epoch": 0.8950221400418471, "grad_norm": 0.4946944674016834, "learning_rate": 1.1213326209768938e-06, "loss": 0.0478, "step": 11496 }, { "epoch": 0.8950999951340567, "grad_norm": 0.3526852995361556, "learning_rate": 1.1196852342672515e-06, "loss": 0.023, "step": 11497 }, { "epoch": 0.8951778502262664, "grad_norm": 0.4779193696054476, "learning_rate": 1.1180390237124762e-06, "loss": 0.0461, "step": 11498 }, { "epoch": 0.895255705318476, "grad_norm": 0.4266251064277774, "learning_rate": 1.1163939894151142e-06, "loss": 0.0307, "step": 11499 }, { "epoch": 0.8953335604106856, "grad_norm": 0.4339299340790119, "learning_rate": 1.1147501314776488e-06, "loss": 0.0332, "step": 11500 }, { "epoch": 0.8953335604106856, "eval_loss": 0.005003530532121658, "eval_runtime": 162.4804, "eval_samples_per_second": 17.725, "eval_steps_per_second": 0.634, "step": 11500 }, { "epoch": 0.8954114155028953, "grad_norm": 0.37261038093755605, "learning_rate": 1.113107450002482e-06, "loss": 0.0299, "step": 11501 }, { "epoch": 0.8954892705951049, "grad_norm": 0.5696740744795372, "learning_rate": 1.1114659450919385e-06, "loss": 0.0661, "step": 11502 }, { "epoch": 0.8955671256873144, "grad_norm": 0.47886339295458047, "learning_rate": 1.109825616848288e-06, "loss": 0.0411, "step": 11503 }, { "epoch": 0.8956449807795241, "grad_norm": 0.5440824220763812, "learning_rate": 1.1081864653737107e-06, "loss": 0.0589, "step": 11504 }, { "epoch": 0.8957228358717337, "grad_norm": 0.5024766303183109, "learning_rate": 1.1065484907703183e-06, "loss": 0.0476, "step": 11505 }, { "epoch": 0.8958006909639433, "grad_norm": 0.5794043386057549, "learning_rate": 1.104911693140145e-06, "loss": 0.0595, "step": 11506 }, { "epoch": 0.895878546056153, "grad_norm": 0.34745379384304076, "learning_rate": 1.1032760725851576e-06, "loss": 0.0267, "step": 11507 }, { "epoch": 0.8959564011483626, "grad_norm": 0.4495373312754213, "learning_rate": 1.1016416292072485e-06, "loss": 0.0337, "step": 11508 }, { "epoch": 0.8960342562405722, "grad_norm": 0.35828495390677223, "learning_rate": 1.1000083631082381e-06, "loss": 0.0264, "step": 11509 }, { "epoch": 0.8961121113327819, "grad_norm": 0.3084166395815972, "learning_rate": 1.0983762743898673e-06, "loss": 0.0179, "step": 11510 }, { "epoch": 0.8961899664249915, "grad_norm": 0.4659919528240046, "learning_rate": 1.0967453631538105e-06, "loss": 0.0407, "step": 11511 }, { "epoch": 0.8962678215172011, "grad_norm": 0.341419310185516, "learning_rate": 1.0951156295016618e-06, "loss": 0.0243, "step": 11512 }, { "epoch": 0.8963456766094108, "grad_norm": 0.4122637748979694, "learning_rate": 1.093487073534949e-06, "loss": 0.0387, "step": 11513 }, { "epoch": 0.8964235317016204, "grad_norm": 0.41695620767116087, "learning_rate": 1.0918596953551286e-06, "loss": 0.0347, "step": 11514 }, { "epoch": 0.89650138679383, "grad_norm": 0.3868664759860722, "learning_rate": 1.0902334950635684e-06, "loss": 0.0263, "step": 11515 }, { "epoch": 0.8965792418860397, "grad_norm": 0.4368395555626008, "learning_rate": 1.0886084727615786e-06, "loss": 0.0411, "step": 11516 }, { "epoch": 0.8966570969782492, "grad_norm": 0.3847514650264617, "learning_rate": 1.0869846285503894e-06, "loss": 0.0286, "step": 11517 }, { "epoch": 0.8967349520704588, "grad_norm": 0.33193260353107024, "learning_rate": 1.0853619625311552e-06, "loss": 0.024, "step": 11518 }, { "epoch": 0.8968128071626685, "grad_norm": 0.41126428280686134, "learning_rate": 1.0837404748049684e-06, "loss": 0.033, "step": 11519 }, { "epoch": 0.8968906622548781, "grad_norm": 0.2789515869010704, "learning_rate": 1.0821201654728354e-06, "loss": 0.0182, "step": 11520 }, { "epoch": 0.8969685173470877, "grad_norm": 0.5094482606735989, "learning_rate": 1.0805010346356925e-06, "loss": 0.039, "step": 11521 }, { "epoch": 0.8970463724392974, "grad_norm": 0.5200380716262433, "learning_rate": 1.0788830823944086e-06, "loss": 0.0526, "step": 11522 }, { "epoch": 0.897124227531507, "grad_norm": 0.476100953301501, "learning_rate": 1.0772663088497693e-06, "loss": 0.0443, "step": 11523 }, { "epoch": 0.8972020826237166, "grad_norm": 0.441600510703099, "learning_rate": 1.075650714102494e-06, "loss": 0.0396, "step": 11524 }, { "epoch": 0.8972799377159262, "grad_norm": 0.35347629867589997, "learning_rate": 1.0740362982532294e-06, "loss": 0.0291, "step": 11525 }, { "epoch": 0.8973577928081359, "grad_norm": 0.4048754226033282, "learning_rate": 1.072423061402541e-06, "loss": 0.0289, "step": 11526 }, { "epoch": 0.8974356479003455, "grad_norm": 0.39202379843593665, "learning_rate": 1.070811003650929e-06, "loss": 0.0391, "step": 11527 }, { "epoch": 0.8975135029925551, "grad_norm": 0.3145837617717731, "learning_rate": 1.0692001250988149e-06, "loss": 0.0189, "step": 11528 }, { "epoch": 0.8975913580847648, "grad_norm": 0.5415770788703542, "learning_rate": 1.0675904258465519e-06, "loss": 0.0618, "step": 11529 }, { "epoch": 0.8976692131769743, "grad_norm": 0.4242788118894868, "learning_rate": 1.0659819059944154e-06, "loss": 0.0345, "step": 11530 }, { "epoch": 0.8977470682691839, "grad_norm": 0.5087598335057154, "learning_rate": 1.0643745656426074e-06, "loss": 0.0447, "step": 11531 }, { "epoch": 0.8978249233613936, "grad_norm": 0.5131950012064578, "learning_rate": 1.0627684048912612e-06, "loss": 0.0459, "step": 11532 }, { "epoch": 0.8979027784536032, "grad_norm": 0.4248543113468323, "learning_rate": 1.0611634238404279e-06, "loss": 0.037, "step": 11533 }, { "epoch": 0.8979806335458128, "grad_norm": 0.40057577212717843, "learning_rate": 1.059559622590094e-06, "loss": 0.0312, "step": 11534 }, { "epoch": 0.8980584886380225, "grad_norm": 0.45485306554155713, "learning_rate": 1.0579570012401686e-06, "loss": 0.0414, "step": 11535 }, { "epoch": 0.8981363437302321, "grad_norm": 0.4811511960005352, "learning_rate": 1.056355559890485e-06, "loss": 0.0393, "step": 11536 }, { "epoch": 0.8982141988224417, "grad_norm": 0.2640961453748578, "learning_rate": 1.0547552986408083e-06, "loss": 0.015, "step": 11537 }, { "epoch": 0.8982920539146514, "grad_norm": 0.6674121981467424, "learning_rate": 1.0531562175908227e-06, "loss": 0.0783, "step": 11538 }, { "epoch": 0.898369909006861, "grad_norm": 0.41157324034917636, "learning_rate": 1.051558316840149e-06, "loss": 0.0329, "step": 11539 }, { "epoch": 0.8984477640990706, "grad_norm": 0.3209961151672863, "learning_rate": 1.049961596488327e-06, "loss": 0.0235, "step": 11540 }, { "epoch": 0.8985256191912803, "grad_norm": 0.608615450924158, "learning_rate": 1.0483660566348242e-06, "loss": 0.0733, "step": 11541 }, { "epoch": 0.8986034742834899, "grad_norm": 0.32406378682053655, "learning_rate": 1.0467716973790365e-06, "loss": 0.0177, "step": 11542 }, { "epoch": 0.8986813293756994, "grad_norm": 0.6154656821074445, "learning_rate": 1.0451785188202845e-06, "loss": 0.0703, "step": 11543 }, { "epoch": 0.8987591844679091, "grad_norm": 0.34426328584937615, "learning_rate": 1.0435865210578134e-06, "loss": 0.022, "step": 11544 }, { "epoch": 0.8988370395601187, "grad_norm": 0.4832512232914681, "learning_rate": 1.041995704190799e-06, "loss": 0.0443, "step": 11545 }, { "epoch": 0.8989148946523283, "grad_norm": 0.4632028188606285, "learning_rate": 1.0404060683183425e-06, "loss": 0.0387, "step": 11546 }, { "epoch": 0.898992749744538, "grad_norm": 0.4704838880257709, "learning_rate": 1.0388176135394712e-06, "loss": 0.0466, "step": 11547 }, { "epoch": 0.8990706048367476, "grad_norm": 0.4867426696907714, "learning_rate": 1.0372303399531325e-06, "loss": 0.0546, "step": 11548 }, { "epoch": 0.8991484599289572, "grad_norm": 0.45607512706254516, "learning_rate": 1.035644247658214e-06, "loss": 0.0396, "step": 11549 }, { "epoch": 0.8992263150211669, "grad_norm": 0.4509871157126055, "learning_rate": 1.034059336753519e-06, "loss": 0.0455, "step": 11550 }, { "epoch": 0.8992263150211669, "eval_loss": 0.00498028052970767, "eval_runtime": 162.5536, "eval_samples_per_second": 17.717, "eval_steps_per_second": 0.634, "step": 11550 }, { "epoch": 0.8993041701133765, "grad_norm": 0.4160652916545052, "learning_rate": 1.0324756073377796e-06, "loss": 0.0303, "step": 11551 }, { "epoch": 0.8993820252055861, "grad_norm": 0.5059389250729802, "learning_rate": 1.0308930595096545e-06, "loss": 0.0496, "step": 11552 }, { "epoch": 0.8994598802977958, "grad_norm": 0.43081317297944766, "learning_rate": 1.0293116933677273e-06, "loss": 0.0356, "step": 11553 }, { "epoch": 0.8995377353900054, "grad_norm": 0.27376136483631536, "learning_rate": 1.0277315090105144e-06, "loss": 0.0121, "step": 11554 }, { "epoch": 0.899615590482215, "grad_norm": 0.457485487648534, "learning_rate": 1.0261525065364485e-06, "loss": 0.0454, "step": 11555 }, { "epoch": 0.8996934455744247, "grad_norm": 0.3678160145850751, "learning_rate": 1.0245746860438977e-06, "loss": 0.0261, "step": 11556 }, { "epoch": 0.8997713006666342, "grad_norm": 0.4833734725750345, "learning_rate": 1.0229980476311518e-06, "loss": 0.0354, "step": 11557 }, { "epoch": 0.8998491557588438, "grad_norm": 0.6092758294363013, "learning_rate": 1.0214225913964281e-06, "loss": 0.066, "step": 11558 }, { "epoch": 0.8999270108510534, "grad_norm": 0.35641231355861336, "learning_rate": 1.0198483174378658e-06, "loss": 0.0217, "step": 11559 }, { "epoch": 0.9000048659432631, "grad_norm": 0.430345300639917, "learning_rate": 1.0182752258535423e-06, "loss": 0.0353, "step": 11560 }, { "epoch": 0.9000827210354727, "grad_norm": 0.37849890148055176, "learning_rate": 1.0167033167414498e-06, "loss": 0.0291, "step": 11561 }, { "epoch": 0.9001605761276823, "grad_norm": 0.5035329829547947, "learning_rate": 1.0151325901995147e-06, "loss": 0.049, "step": 11562 }, { "epoch": 0.900238431219892, "grad_norm": 0.41755732121665473, "learning_rate": 1.0135630463255786e-06, "loss": 0.0301, "step": 11563 }, { "epoch": 0.9003162863121016, "grad_norm": 0.3178363946854092, "learning_rate": 1.0119946852174189e-06, "loss": 0.0214, "step": 11564 }, { "epoch": 0.9003941414043112, "grad_norm": 0.4263022811053957, "learning_rate": 1.0104275069727398e-06, "loss": 0.0418, "step": 11565 }, { "epoch": 0.9004719964965209, "grad_norm": 0.41588231458831304, "learning_rate": 1.0088615116891697e-06, "loss": 0.0365, "step": 11566 }, { "epoch": 0.9005498515887305, "grad_norm": 0.3222936668081732, "learning_rate": 1.0072966994642618e-06, "loss": 0.0247, "step": 11567 }, { "epoch": 0.90062770668094, "grad_norm": 0.6075590102295116, "learning_rate": 1.0057330703954937e-06, "loss": 0.0406, "step": 11568 }, { "epoch": 0.9007055617731498, "grad_norm": 0.4138680514204485, "learning_rate": 1.0041706245802741e-06, "loss": 0.0337, "step": 11569 }, { "epoch": 0.9007834168653593, "grad_norm": 0.4442707841129658, "learning_rate": 1.0026093621159383e-06, "loss": 0.0448, "step": 11570 }, { "epoch": 0.9008612719575689, "grad_norm": 0.39111898451692867, "learning_rate": 1.0010492830997421e-06, "loss": 0.0301, "step": 11571 }, { "epoch": 0.9009391270497786, "grad_norm": 0.47753630843654404, "learning_rate": 9.994903876288763e-07, "loss": 0.0466, "step": 11572 }, { "epoch": 0.9010169821419882, "grad_norm": 0.43229007595563385, "learning_rate": 9.97932675800448e-07, "loss": 0.035, "step": 11573 }, { "epoch": 0.9010948372341978, "grad_norm": 0.340850091188278, "learning_rate": 9.963761477114908e-07, "loss": 0.026, "step": 11574 }, { "epoch": 0.9011726923264075, "grad_norm": 0.4697225796059186, "learning_rate": 9.948208034589802e-07, "loss": 0.0419, "step": 11575 }, { "epoch": 0.9012505474186171, "grad_norm": 0.514717495835652, "learning_rate": 9.932666431397987e-07, "loss": 0.0481, "step": 11576 }, { "epoch": 0.9013284025108267, "grad_norm": 0.42166240567499913, "learning_rate": 9.917136668507664e-07, "loss": 0.0388, "step": 11577 }, { "epoch": 0.9014062576030364, "grad_norm": 0.39524998469611783, "learning_rate": 9.901618746886266e-07, "loss": 0.0359, "step": 11578 }, { "epoch": 0.901484112695246, "grad_norm": 0.39304366170388466, "learning_rate": 9.886112667500457e-07, "loss": 0.0309, "step": 11579 }, { "epoch": 0.9015619677874556, "grad_norm": 0.42234085872790267, "learning_rate": 9.870618431316226e-07, "loss": 0.0343, "step": 11580 }, { "epoch": 0.9016398228796653, "grad_norm": 0.8333561876624804, "learning_rate": 9.85513603929882e-07, "loss": 0.0588, "step": 11581 }, { "epoch": 0.9017176779718749, "grad_norm": 0.39251533062107347, "learning_rate": 9.839665492412643e-07, "loss": 0.0285, "step": 11582 }, { "epoch": 0.9017955330640844, "grad_norm": 0.5285854290040974, "learning_rate": 9.824206791621482e-07, "loss": 0.0379, "step": 11583 }, { "epoch": 0.9018733881562941, "grad_norm": 0.46048995687465655, "learning_rate": 9.808759937888302e-07, "loss": 0.048, "step": 11584 }, { "epoch": 0.9019512432485037, "grad_norm": 0.6559032317167959, "learning_rate": 9.79332493217542e-07, "loss": 0.0635, "step": 11585 }, { "epoch": 0.9020290983407133, "grad_norm": 0.49041234831823777, "learning_rate": 9.777901775444353e-07, "loss": 0.0493, "step": 11586 }, { "epoch": 0.902106953432923, "grad_norm": 0.38974325127142173, "learning_rate": 9.762490468655894e-07, "loss": 0.0327, "step": 11587 }, { "epoch": 0.9021848085251326, "grad_norm": 0.5046737431967021, "learning_rate": 9.747091012770071e-07, "loss": 0.037, "step": 11588 }, { "epoch": 0.9022626636173422, "grad_norm": 0.4210159733400079, "learning_rate": 9.731703408746162e-07, "loss": 0.038, "step": 11589 }, { "epoch": 0.9023405187095519, "grad_norm": 0.4490282967890602, "learning_rate": 9.716327657542868e-07, "loss": 0.0427, "step": 11590 }, { "epoch": 0.9024183738017615, "grad_norm": 0.5267327058998847, "learning_rate": 9.700963760117933e-07, "loss": 0.0565, "step": 11591 }, { "epoch": 0.9024962288939711, "grad_norm": 0.39403239412508856, "learning_rate": 9.685611717428456e-07, "loss": 0.0327, "step": 11592 }, { "epoch": 0.9025740839861808, "grad_norm": 0.46708935389265643, "learning_rate": 9.670271530430809e-07, "loss": 0.0431, "step": 11593 }, { "epoch": 0.9026519390783904, "grad_norm": 0.5086788738052798, "learning_rate": 9.654943200080623e-07, "loss": 0.0646, "step": 11594 }, { "epoch": 0.9027297941706, "grad_norm": 0.43512029237227706, "learning_rate": 9.63962672733274e-07, "loss": 0.0368, "step": 11595 }, { "epoch": 0.9028076492628095, "grad_norm": 0.36061308131283343, "learning_rate": 9.624322113141371e-07, "loss": 0.0236, "step": 11596 }, { "epoch": 0.9028855043550192, "grad_norm": 0.4453038521001498, "learning_rate": 9.609029358459887e-07, "loss": 0.049, "step": 11597 }, { "epoch": 0.9029633594472288, "grad_norm": 0.6644769563098571, "learning_rate": 9.59374846424097e-07, "loss": 0.0787, "step": 11598 }, { "epoch": 0.9030412145394384, "grad_norm": 0.540987822890813, "learning_rate": 9.578479431436527e-07, "loss": 0.0649, "step": 11599 }, { "epoch": 0.9031190696316481, "grad_norm": 0.44921491274781367, "learning_rate": 9.563222260997752e-07, "loss": 0.043, "step": 11600 }, { "epoch": 0.9031190696316481, "eval_loss": 0.004960049409419298, "eval_runtime": 162.4312, "eval_samples_per_second": 17.731, "eval_steps_per_second": 0.634, "step": 11600 }, { "epoch": 0.9031969247238577, "grad_norm": 0.42537802610130265, "learning_rate": 9.547976953875105e-07, "loss": 0.0358, "step": 11601 }, { "epoch": 0.9032747798160673, "grad_norm": 0.4496925060786417, "learning_rate": 9.532743511018272e-07, "loss": 0.0401, "step": 11602 }, { "epoch": 0.903352634908277, "grad_norm": 0.37349439027218634, "learning_rate": 9.517521933376273e-07, "loss": 0.0335, "step": 11603 }, { "epoch": 0.9034304900004866, "grad_norm": 0.3642191680305872, "learning_rate": 9.502312221897281e-07, "loss": 0.0239, "step": 11604 }, { "epoch": 0.9035083450926962, "grad_norm": 0.46129047336282364, "learning_rate": 9.487114377528806e-07, "loss": 0.0413, "step": 11605 }, { "epoch": 0.9035862001849059, "grad_norm": 0.4592106431930127, "learning_rate": 9.471928401217645e-07, "loss": 0.0448, "step": 11606 }, { "epoch": 0.9036640552771155, "grad_norm": 0.5364396257234522, "learning_rate": 9.456754293909775e-07, "loss": 0.0631, "step": 11607 }, { "epoch": 0.903741910369325, "grad_norm": 0.34084078196603523, "learning_rate": 9.441592056550486e-07, "loss": 0.0283, "step": 11608 }, { "epoch": 0.9038197654615348, "grad_norm": 0.5070646949515006, "learning_rate": 9.42644169008431e-07, "loss": 0.0603, "step": 11609 }, { "epoch": 0.9038976205537443, "grad_norm": 0.41854408937509646, "learning_rate": 9.411303195455046e-07, "loss": 0.0371, "step": 11610 }, { "epoch": 0.9039754756459539, "grad_norm": 0.3744265741337344, "learning_rate": 9.396176573605742e-07, "loss": 0.0278, "step": 11611 }, { "epoch": 0.9040533307381636, "grad_norm": 0.3007546084231099, "learning_rate": 9.38106182547871e-07, "loss": 0.0225, "step": 11612 }, { "epoch": 0.9041311858303732, "grad_norm": 0.39977380186321276, "learning_rate": 9.365958952015553e-07, "loss": 0.0335, "step": 11613 }, { "epoch": 0.9042090409225828, "grad_norm": 0.4797629884129433, "learning_rate": 9.350867954157095e-07, "loss": 0.05, "step": 11614 }, { "epoch": 0.9042868960147925, "grad_norm": 0.5658195726698494, "learning_rate": 9.335788832843407e-07, "loss": 0.0643, "step": 11615 }, { "epoch": 0.9043647511070021, "grad_norm": 0.5945180751036337, "learning_rate": 9.320721589013892e-07, "loss": 0.0607, "step": 11616 }, { "epoch": 0.9044426061992117, "grad_norm": 0.4366159880718355, "learning_rate": 9.305666223607157e-07, "loss": 0.0403, "step": 11617 }, { "epoch": 0.9045204612914214, "grad_norm": 0.37077578600565025, "learning_rate": 9.29062273756105e-07, "loss": 0.0263, "step": 11618 }, { "epoch": 0.904598316383631, "grad_norm": 0.3906164529821106, "learning_rate": 9.275591131812756e-07, "loss": 0.0272, "step": 11619 }, { "epoch": 0.9046761714758406, "grad_norm": 0.5768610632373155, "learning_rate": 9.260571407298657e-07, "loss": 0.0649, "step": 11620 }, { "epoch": 0.9047540265680503, "grad_norm": 0.37653300483635843, "learning_rate": 9.245563564954385e-07, "loss": 0.0263, "step": 11621 }, { "epoch": 0.9048318816602599, "grad_norm": 0.3563459755046728, "learning_rate": 9.230567605714902e-07, "loss": 0.0215, "step": 11622 }, { "epoch": 0.9049097367524694, "grad_norm": 0.34102010841306, "learning_rate": 9.215583530514327e-07, "loss": 0.0233, "step": 11623 }, { "epoch": 0.9049875918446791, "grad_norm": 0.36297517439286353, "learning_rate": 9.200611340286159e-07, "loss": 0.0268, "step": 11624 }, { "epoch": 0.9050654469368887, "grad_norm": 0.5667150798927341, "learning_rate": 9.185651035963027e-07, "loss": 0.0625, "step": 11625 }, { "epoch": 0.9051433020290983, "grad_norm": 0.35114767159445653, "learning_rate": 9.170702618476968e-07, "loss": 0.0328, "step": 11626 }, { "epoch": 0.905221157121308, "grad_norm": 0.463750767668718, "learning_rate": 9.155766088759166e-07, "loss": 0.0563, "step": 11627 }, { "epoch": 0.9052990122135176, "grad_norm": 0.3474395754009698, "learning_rate": 9.140841447740078e-07, "loss": 0.0295, "step": 11628 }, { "epoch": 0.9053768673057272, "grad_norm": 0.5512709859380924, "learning_rate": 9.125928696349474e-07, "loss": 0.0571, "step": 11629 }, { "epoch": 0.9054547223979369, "grad_norm": 0.3975145123139862, "learning_rate": 9.111027835516273e-07, "loss": 0.0369, "step": 11630 }, { "epoch": 0.9055325774901465, "grad_norm": 0.47957368280242957, "learning_rate": 9.096138866168802e-07, "loss": 0.0431, "step": 11631 }, { "epoch": 0.9056104325823561, "grad_norm": 0.41403861064262976, "learning_rate": 9.081261789234563e-07, "loss": 0.0353, "step": 11632 }, { "epoch": 0.9056882876745657, "grad_norm": 0.4056449885427289, "learning_rate": 9.066396605640305e-07, "loss": 0.0285, "step": 11633 }, { "epoch": 0.9057661427667754, "grad_norm": 0.531134058458143, "learning_rate": 9.051543316312084e-07, "loss": 0.0594, "step": 11634 }, { "epoch": 0.905843997858985, "grad_norm": 0.34818663613807477, "learning_rate": 9.036701922175184e-07, "loss": 0.0233, "step": 11635 }, { "epoch": 0.9059218529511945, "grad_norm": 0.36577333076477553, "learning_rate": 9.02187242415411e-07, "loss": 0.0269, "step": 11636 }, { "epoch": 0.9059997080434042, "grad_norm": 0.35733703066873274, "learning_rate": 9.007054823172723e-07, "loss": 0.0299, "step": 11637 }, { "epoch": 0.9060775631356138, "grad_norm": 0.39200591151254327, "learning_rate": 8.992249120154107e-07, "loss": 0.0277, "step": 11638 }, { "epoch": 0.9061554182278234, "grad_norm": 0.40060311073659355, "learning_rate": 8.977455316020545e-07, "loss": 0.0328, "step": 11639 }, { "epoch": 0.9062332733200331, "grad_norm": 0.38694063620112146, "learning_rate": 8.962673411693612e-07, "loss": 0.029, "step": 11640 }, { "epoch": 0.9063111284122427, "grad_norm": 0.33875725653005184, "learning_rate": 8.947903408094149e-07, "loss": 0.0238, "step": 11641 }, { "epoch": 0.9063889835044523, "grad_norm": 0.44672005480304405, "learning_rate": 8.93314530614231e-07, "loss": 0.0412, "step": 11642 }, { "epoch": 0.906466838596662, "grad_norm": 0.4704520481360441, "learning_rate": 8.918399106757425e-07, "loss": 0.0466, "step": 11643 }, { "epoch": 0.9065446936888716, "grad_norm": 0.5171536914030118, "learning_rate": 8.903664810858115e-07, "loss": 0.0585, "step": 11644 }, { "epoch": 0.9066225487810812, "grad_norm": 0.4403725208782376, "learning_rate": 8.888942419362245e-07, "loss": 0.0378, "step": 11645 }, { "epoch": 0.9067004038732909, "grad_norm": 0.7094611119189181, "learning_rate": 8.87423193318695e-07, "loss": 0.0949, "step": 11646 }, { "epoch": 0.9067782589655005, "grad_norm": 0.4783172595975721, "learning_rate": 8.859533353248673e-07, "loss": 0.05, "step": 11647 }, { "epoch": 0.90685611405771, "grad_norm": 0.4560351273823873, "learning_rate": 8.844846680463015e-07, "loss": 0.0377, "step": 11648 }, { "epoch": 0.9069339691499197, "grad_norm": 0.3676560293577226, "learning_rate": 8.830171915744889e-07, "loss": 0.0245, "step": 11649 }, { "epoch": 0.9070118242421293, "grad_norm": 0.4638763939309758, "learning_rate": 8.815509060008476e-07, "loss": 0.0361, "step": 11650 }, { "epoch": 0.9070118242421293, "eval_loss": 0.004940858576446772, "eval_runtime": 162.1843, "eval_samples_per_second": 17.758, "eval_steps_per_second": 0.635, "step": 11650 }, { "epoch": 0.9070896793343389, "grad_norm": 0.4810009408631267, "learning_rate": 8.800858114167177e-07, "loss": 0.0305, "step": 11651 }, { "epoch": 0.9071675344265486, "grad_norm": 0.4522119219929504, "learning_rate": 8.786219079133751e-07, "loss": 0.0325, "step": 11652 }, { "epoch": 0.9072453895187582, "grad_norm": 0.4652212425362043, "learning_rate": 8.771591955820069e-07, "loss": 0.0416, "step": 11653 }, { "epoch": 0.9073232446109678, "grad_norm": 0.43788192789349467, "learning_rate": 8.756976745137358e-07, "loss": 0.0368, "step": 11654 }, { "epoch": 0.9074010997031775, "grad_norm": 0.44444107002363964, "learning_rate": 8.742373447996067e-07, "loss": 0.0429, "step": 11655 }, { "epoch": 0.9074789547953871, "grad_norm": 0.359641419228595, "learning_rate": 8.727782065305935e-07, "loss": 0.029, "step": 11656 }, { "epoch": 0.9075568098875967, "grad_norm": 0.4096624429868637, "learning_rate": 8.713202597975901e-07, "loss": 0.037, "step": 11657 }, { "epoch": 0.9076346649798064, "grad_norm": 0.5601182823331384, "learning_rate": 8.698635046914239e-07, "loss": 0.0464, "step": 11658 }, { "epoch": 0.907712520072016, "grad_norm": 0.48425462198461083, "learning_rate": 8.684079413028401e-07, "loss": 0.0399, "step": 11659 }, { "epoch": 0.9077903751642256, "grad_norm": 0.46392652452907773, "learning_rate": 8.669535697225174e-07, "loss": 0.0454, "step": 11660 }, { "epoch": 0.9078682302564353, "grad_norm": 0.48812798770453514, "learning_rate": 8.655003900410497e-07, "loss": 0.047, "step": 11661 }, { "epoch": 0.9079460853486448, "grad_norm": 0.42285775569962714, "learning_rate": 8.640484023489692e-07, "loss": 0.0416, "step": 11662 }, { "epoch": 0.9080239404408544, "grad_norm": 0.41456159561601874, "learning_rate": 8.625976067367281e-07, "loss": 0.0389, "step": 11663 }, { "epoch": 0.9081017955330641, "grad_norm": 0.4886145626276117, "learning_rate": 8.611480032947028e-07, "loss": 0.0541, "step": 11664 }, { "epoch": 0.9081796506252737, "grad_norm": 0.4396621133190478, "learning_rate": 8.596995921131945e-07, "loss": 0.0396, "step": 11665 }, { "epoch": 0.9082575057174833, "grad_norm": 0.44542081474935075, "learning_rate": 8.582523732824355e-07, "loss": 0.0391, "step": 11666 }, { "epoch": 0.9083353608096929, "grad_norm": 0.3204420348117637, "learning_rate": 8.568063468925803e-07, "loss": 0.0191, "step": 11667 }, { "epoch": 0.9084132159019026, "grad_norm": 0.4235738044764621, "learning_rate": 8.553615130337056e-07, "loss": 0.0328, "step": 11668 }, { "epoch": 0.9084910709941122, "grad_norm": 0.43376256311652117, "learning_rate": 8.539178717958241e-07, "loss": 0.0392, "step": 11669 }, { "epoch": 0.9085689260863218, "grad_norm": 0.48599015346518537, "learning_rate": 8.524754232688637e-07, "loss": 0.0466, "step": 11670 }, { "epoch": 0.9086467811785315, "grad_norm": 0.409172757806546, "learning_rate": 8.510341675426815e-07, "loss": 0.0376, "step": 11671 }, { "epoch": 0.9087246362707411, "grad_norm": 0.3603338455995818, "learning_rate": 8.495941047070633e-07, "loss": 0.0236, "step": 11672 }, { "epoch": 0.9088024913629507, "grad_norm": 0.4284441011169492, "learning_rate": 8.481552348517175e-07, "loss": 0.0458, "step": 11673 }, { "epoch": 0.9088803464551604, "grad_norm": 0.4751996301920126, "learning_rate": 8.46717558066279e-07, "loss": 0.039, "step": 11674 }, { "epoch": 0.90895820154737, "grad_norm": 0.4471014329629447, "learning_rate": 8.452810744403095e-07, "loss": 0.0427, "step": 11675 }, { "epoch": 0.9090360566395795, "grad_norm": 0.40257469986695127, "learning_rate": 8.43845784063293e-07, "loss": 0.0357, "step": 11676 }, { "epoch": 0.9091139117317892, "grad_norm": 0.3973130041331278, "learning_rate": 8.424116870246424e-07, "loss": 0.0293, "step": 11677 }, { "epoch": 0.9091917668239988, "grad_norm": 0.3773753599521012, "learning_rate": 8.409787834136951e-07, "loss": 0.0355, "step": 11678 }, { "epoch": 0.9092696219162084, "grad_norm": 0.3672255273151425, "learning_rate": 8.395470733197153e-07, "loss": 0.0259, "step": 11679 }, { "epoch": 0.9093474770084181, "grad_norm": 0.4623716407302227, "learning_rate": 8.381165568318894e-07, "loss": 0.0308, "step": 11680 }, { "epoch": 0.9094253321006277, "grad_norm": 0.31895368271439867, "learning_rate": 8.366872340393329e-07, "loss": 0.0169, "step": 11681 }, { "epoch": 0.9095031871928373, "grad_norm": 0.42713622726068673, "learning_rate": 8.352591050310854e-07, "loss": 0.0305, "step": 11682 }, { "epoch": 0.909581042285047, "grad_norm": 0.5731921591172183, "learning_rate": 8.338321698961138e-07, "loss": 0.0639, "step": 11683 }, { "epoch": 0.9096588973772566, "grad_norm": 0.49698940644536516, "learning_rate": 8.324064287233114e-07, "loss": 0.051, "step": 11684 }, { "epoch": 0.9097367524694662, "grad_norm": 0.5083443355244158, "learning_rate": 8.309818816014914e-07, "loss": 0.0611, "step": 11685 }, { "epoch": 0.9098146075616759, "grad_norm": 0.4276160858206205, "learning_rate": 8.295585286193985e-07, "loss": 0.0431, "step": 11686 }, { "epoch": 0.9098924626538855, "grad_norm": 0.39093824241529224, "learning_rate": 8.281363698656997e-07, "loss": 0.0226, "step": 11687 }, { "epoch": 0.909970317746095, "grad_norm": 0.5282664861606104, "learning_rate": 8.267154054289883e-07, "loss": 0.0496, "step": 11688 }, { "epoch": 0.9100481728383047, "grad_norm": 0.44855646116069386, "learning_rate": 8.25295635397787e-07, "loss": 0.0497, "step": 11689 }, { "epoch": 0.9101260279305143, "grad_norm": 0.42207340598610205, "learning_rate": 8.238770598605383e-07, "loss": 0.0345, "step": 11690 }, { "epoch": 0.9102038830227239, "grad_norm": 0.4489968280700913, "learning_rate": 8.224596789056116e-07, "loss": 0.0404, "step": 11691 }, { "epoch": 0.9102817381149336, "grad_norm": 0.4238521661711986, "learning_rate": 8.210434926213029e-07, "loss": 0.0305, "step": 11692 }, { "epoch": 0.9103595932071432, "grad_norm": 0.43193084180499625, "learning_rate": 8.196285010958371e-07, "loss": 0.0477, "step": 11693 }, { "epoch": 0.9104374482993528, "grad_norm": 0.41138842813708837, "learning_rate": 8.182147044173617e-07, "loss": 0.0344, "step": 11694 }, { "epoch": 0.9105153033915625, "grad_norm": 0.34686517614722456, "learning_rate": 8.168021026739459e-07, "loss": 0.0236, "step": 11695 }, { "epoch": 0.9105931584837721, "grad_norm": 0.6544783404192511, "learning_rate": 8.153906959535928e-07, "loss": 0.0557, "step": 11696 }, { "epoch": 0.9106710135759817, "grad_norm": 0.38716204067258925, "learning_rate": 8.139804843442189e-07, "loss": 0.0292, "step": 11697 }, { "epoch": 0.9107488686681914, "grad_norm": 0.6105945995631555, "learning_rate": 8.125714679336827e-07, "loss": 0.0659, "step": 11698 }, { "epoch": 0.910826723760401, "grad_norm": 0.4048821289229902, "learning_rate": 8.111636468097539e-07, "loss": 0.0353, "step": 11699 }, { "epoch": 0.9109045788526106, "grad_norm": 0.4755505551299845, "learning_rate": 8.097570210601335e-07, "loss": 0.0485, "step": 11700 }, { "epoch": 0.9109045788526106, "eval_loss": 0.0049150437116622925, "eval_runtime": 162.4162, "eval_samples_per_second": 17.732, "eval_steps_per_second": 0.634, "step": 11700 }, { "epoch": 0.9109824339448203, "grad_norm": 0.4608624015215782, "learning_rate": 8.083515907724515e-07, "loss": 0.0427, "step": 11701 }, { "epoch": 0.9110602890370298, "grad_norm": 0.37938507141254296, "learning_rate": 8.069473560342512e-07, "loss": 0.0372, "step": 11702 }, { "epoch": 0.9111381441292394, "grad_norm": 0.36072110429887216, "learning_rate": 8.055443169330201e-07, "loss": 0.0253, "step": 11703 }, { "epoch": 0.911215999221449, "grad_norm": 0.5358197544889017, "learning_rate": 8.041424735561531e-07, "loss": 0.0582, "step": 11704 }, { "epoch": 0.9112938543136587, "grad_norm": 0.46362846401599406, "learning_rate": 8.027418259909869e-07, "loss": 0.0507, "step": 11705 }, { "epoch": 0.9113717094058683, "grad_norm": 0.43411371259120796, "learning_rate": 8.013423743247673e-07, "loss": 0.04, "step": 11706 }, { "epoch": 0.9114495644980779, "grad_norm": 0.401076724796709, "learning_rate": 7.999441186446711e-07, "loss": 0.0257, "step": 11707 }, { "epoch": 0.9115274195902876, "grad_norm": 0.4609182451754373, "learning_rate": 7.98547059037813e-07, "loss": 0.037, "step": 11708 }, { "epoch": 0.9116052746824972, "grad_norm": 0.3866747789548587, "learning_rate": 7.97151195591217e-07, "loss": 0.0311, "step": 11709 }, { "epoch": 0.9116831297747068, "grad_norm": 0.3746676951616762, "learning_rate": 7.9575652839184e-07, "loss": 0.0243, "step": 11710 }, { "epoch": 0.9117609848669165, "grad_norm": 0.4657258060776218, "learning_rate": 7.943630575265637e-07, "loss": 0.0478, "step": 11711 }, { "epoch": 0.9118388399591261, "grad_norm": 0.3778335326695217, "learning_rate": 7.929707830821965e-07, "loss": 0.0331, "step": 11712 }, { "epoch": 0.9119166950513357, "grad_norm": 0.3806296994417585, "learning_rate": 7.915797051454643e-07, "loss": 0.029, "step": 11713 }, { "epoch": 0.9119945501435454, "grad_norm": 0.29173029281110435, "learning_rate": 7.901898238030336e-07, "loss": 0.0206, "step": 11714 }, { "epoch": 0.912072405235755, "grad_norm": 0.3728951277213142, "learning_rate": 7.888011391414796e-07, "loss": 0.0209, "step": 11715 }, { "epoch": 0.9121502603279645, "grad_norm": 0.37506141725451975, "learning_rate": 7.874136512473151e-07, "loss": 0.0323, "step": 11716 }, { "epoch": 0.9122281154201742, "grad_norm": 0.312875778651148, "learning_rate": 7.860273602069735e-07, "loss": 0.0189, "step": 11717 }, { "epoch": 0.9123059705123838, "grad_norm": 0.40129171343793196, "learning_rate": 7.846422661068098e-07, "loss": 0.0328, "step": 11718 }, { "epoch": 0.9123838256045934, "grad_norm": 0.38241937036505935, "learning_rate": 7.832583690331152e-07, "loss": 0.0253, "step": 11719 }, { "epoch": 0.9124616806968031, "grad_norm": 0.4095090466151796, "learning_rate": 7.818756690720985e-07, "loss": 0.0341, "step": 11720 }, { "epoch": 0.9125395357890127, "grad_norm": 0.35297553914385044, "learning_rate": 7.804941663098953e-07, "loss": 0.0295, "step": 11721 }, { "epoch": 0.9126173908812223, "grad_norm": 0.4601059809503186, "learning_rate": 7.791138608325655e-07, "loss": 0.0592, "step": 11722 }, { "epoch": 0.912695245973432, "grad_norm": 0.33247505205861766, "learning_rate": 7.777347527260936e-07, "loss": 0.0184, "step": 11723 }, { "epoch": 0.9127731010656416, "grad_norm": 0.5874501439923093, "learning_rate": 7.763568420763978e-07, "loss": 0.059, "step": 11724 }, { "epoch": 0.9128509561578512, "grad_norm": 0.516577801506794, "learning_rate": 7.749801289693093e-07, "loss": 0.0578, "step": 11725 }, { "epoch": 0.9129288112500609, "grad_norm": 0.4432299099485143, "learning_rate": 7.736046134905928e-07, "loss": 0.0474, "step": 11726 }, { "epoch": 0.9130066663422705, "grad_norm": 0.5576990949072109, "learning_rate": 7.722302957259375e-07, "loss": 0.0588, "step": 11727 }, { "epoch": 0.91308452143448, "grad_norm": 0.3845125374295577, "learning_rate": 7.708571757609529e-07, "loss": 0.0329, "step": 11728 }, { "epoch": 0.9131623765266897, "grad_norm": 0.45413614795666024, "learning_rate": 7.694852536811859e-07, "loss": 0.0484, "step": 11729 }, { "epoch": 0.9132402316188993, "grad_norm": 0.36131064706424876, "learning_rate": 7.681145295720948e-07, "loss": 0.0354, "step": 11730 }, { "epoch": 0.9133180867111089, "grad_norm": 0.4650531614445502, "learning_rate": 7.667450035190715e-07, "loss": 0.0473, "step": 11731 }, { "epoch": 0.9133959418033186, "grad_norm": 0.42124345038952227, "learning_rate": 7.653766756074299e-07, "loss": 0.0331, "step": 11732 }, { "epoch": 0.9134737968955282, "grad_norm": 0.49749034469110526, "learning_rate": 7.640095459224106e-07, "loss": 0.0575, "step": 11733 }, { "epoch": 0.9135516519877378, "grad_norm": 0.4388701177429022, "learning_rate": 7.626436145491789e-07, "loss": 0.0425, "step": 11734 }, { "epoch": 0.9136295070799475, "grad_norm": 0.418053920656785, "learning_rate": 7.612788815728267e-07, "loss": 0.0363, "step": 11735 }, { "epoch": 0.9137073621721571, "grad_norm": 0.3787549488492863, "learning_rate": 7.599153470783705e-07, "loss": 0.0276, "step": 11736 }, { "epoch": 0.9137852172643667, "grad_norm": 0.36892947715190844, "learning_rate": 7.58553011150751e-07, "loss": 0.0257, "step": 11737 }, { "epoch": 0.9138630723565763, "grad_norm": 0.4346038755117769, "learning_rate": 7.57191873874834e-07, "loss": 0.0377, "step": 11738 }, { "epoch": 0.913940927448786, "grad_norm": 0.45737155501389487, "learning_rate": 7.55831935335416e-07, "loss": 0.0358, "step": 11739 }, { "epoch": 0.9140187825409956, "grad_norm": 0.49330385298426144, "learning_rate": 7.544731956172135e-07, "loss": 0.0508, "step": 11740 }, { "epoch": 0.9140966376332051, "grad_norm": 0.35362981289858136, "learning_rate": 7.531156548048702e-07, "loss": 0.0275, "step": 11741 }, { "epoch": 0.9141744927254148, "grad_norm": 0.5721335989807531, "learning_rate": 7.517593129829515e-07, "loss": 0.0522, "step": 11742 }, { "epoch": 0.9142523478176244, "grad_norm": 0.4070537993650106, "learning_rate": 7.504041702359544e-07, "loss": 0.0276, "step": 11743 }, { "epoch": 0.914330202909834, "grad_norm": 0.43981472116998777, "learning_rate": 7.490502266482958e-07, "loss": 0.0375, "step": 11744 }, { "epoch": 0.9144080580020437, "grad_norm": 0.2789355922458818, "learning_rate": 7.476974823043193e-07, "loss": 0.0109, "step": 11745 }, { "epoch": 0.9144859130942533, "grad_norm": 0.4643648279398792, "learning_rate": 7.463459372882976e-07, "loss": 0.0474, "step": 11746 }, { "epoch": 0.9145637681864629, "grad_norm": 0.4574174919950849, "learning_rate": 7.449955916844231e-07, "loss": 0.0315, "step": 11747 }, { "epoch": 0.9146416232786726, "grad_norm": 0.3535347215701558, "learning_rate": 7.436464455768156e-07, "loss": 0.0217, "step": 11748 }, { "epoch": 0.9147194783708822, "grad_norm": 0.4240471849081252, "learning_rate": 7.422984990495208e-07, "loss": 0.0353, "step": 11749 }, { "epoch": 0.9147973334630918, "grad_norm": 0.36555809014971263, "learning_rate": 7.409517521865118e-07, "loss": 0.0293, "step": 11750 }, { "epoch": 0.9147973334630918, "eval_loss": 0.004896061960607767, "eval_runtime": 162.9044, "eval_samples_per_second": 17.679, "eval_steps_per_second": 0.632, "step": 11750 }, { "epoch": 0.9148751885553015, "grad_norm": 0.5082337728545656, "learning_rate": 7.396062050716835e-07, "loss": 0.0541, "step": 11751 }, { "epoch": 0.9149530436475111, "grad_norm": 0.38659067776077954, "learning_rate": 7.382618577888579e-07, "loss": 0.027, "step": 11752 }, { "epoch": 0.9150308987397207, "grad_norm": 0.3961575258015876, "learning_rate": 7.36918710421779e-07, "loss": 0.0358, "step": 11753 }, { "epoch": 0.9151087538319304, "grad_norm": 0.4578185747600462, "learning_rate": 7.3557676305412e-07, "loss": 0.0461, "step": 11754 }, { "epoch": 0.9151866089241399, "grad_norm": 0.4154910063373735, "learning_rate": 7.342360157694784e-07, "loss": 0.0368, "step": 11755 }, { "epoch": 0.9152644640163495, "grad_norm": 0.45788316056068923, "learning_rate": 7.328964686513762e-07, "loss": 0.0414, "step": 11756 }, { "epoch": 0.9153423191085592, "grad_norm": 0.5242748735103557, "learning_rate": 7.315581217832601e-07, "loss": 0.064, "step": 11757 }, { "epoch": 0.9154201742007688, "grad_norm": 0.3709609865037594, "learning_rate": 7.302209752485035e-07, "loss": 0.0284, "step": 11758 }, { "epoch": 0.9154980292929784, "grad_norm": 0.42739418277463864, "learning_rate": 7.288850291304017e-07, "loss": 0.0414, "step": 11759 }, { "epoch": 0.9155758843851881, "grad_norm": 0.4290632288063904, "learning_rate": 7.275502835121818e-07, "loss": 0.039, "step": 11760 }, { "epoch": 0.9156537394773977, "grad_norm": 0.5096438853431142, "learning_rate": 7.262167384769925e-07, "loss": 0.0466, "step": 11761 }, { "epoch": 0.9157315945696073, "grad_norm": 0.5290075296535347, "learning_rate": 7.248843941079053e-07, "loss": 0.0574, "step": 11762 }, { "epoch": 0.915809449661817, "grad_norm": 0.3039843498847219, "learning_rate": 7.235532504879206e-07, "loss": 0.0155, "step": 11763 }, { "epoch": 0.9158873047540266, "grad_norm": 0.40158286384724506, "learning_rate": 7.222233076999563e-07, "loss": 0.0292, "step": 11764 }, { "epoch": 0.9159651598462362, "grad_norm": 0.4830688107434883, "learning_rate": 7.208945658268706e-07, "loss": 0.0365, "step": 11765 }, { "epoch": 0.9160430149384459, "grad_norm": 0.4972073811295138, "learning_rate": 7.195670249514308e-07, "loss": 0.0495, "step": 11766 }, { "epoch": 0.9161208700306555, "grad_norm": 0.5292689772303739, "learning_rate": 7.182406851563417e-07, "loss": 0.0446, "step": 11767 }, { "epoch": 0.916198725122865, "grad_norm": 0.3933496397054274, "learning_rate": 7.169155465242261e-07, "loss": 0.0215, "step": 11768 }, { "epoch": 0.9162765802150747, "grad_norm": 0.4866503034036796, "learning_rate": 7.155916091376291e-07, "loss": 0.0384, "step": 11769 }, { "epoch": 0.9163544353072843, "grad_norm": 0.4769273622249403, "learning_rate": 7.142688730790337e-07, "loss": 0.0389, "step": 11770 }, { "epoch": 0.9164322903994939, "grad_norm": 0.43856359860575755, "learning_rate": 7.129473384308361e-07, "loss": 0.041, "step": 11771 }, { "epoch": 0.9165101454917036, "grad_norm": 0.4598305685721035, "learning_rate": 7.116270052753638e-07, "loss": 0.0356, "step": 11772 }, { "epoch": 0.9165880005839132, "grad_norm": 0.5674847052077185, "learning_rate": 7.103078736948643e-07, "loss": 0.0471, "step": 11773 }, { "epoch": 0.9166658556761228, "grad_norm": 0.38773228350981526, "learning_rate": 7.08989943771512e-07, "loss": 0.0232, "step": 11774 }, { "epoch": 0.9167437107683324, "grad_norm": 0.3277662920096019, "learning_rate": 7.076732155874122e-07, "loss": 0.0198, "step": 11775 }, { "epoch": 0.9168215658605421, "grad_norm": 0.39495891876649264, "learning_rate": 7.063576892245904e-07, "loss": 0.0296, "step": 11776 }, { "epoch": 0.9168994209527517, "grad_norm": 0.43250248280391906, "learning_rate": 7.050433647649968e-07, "loss": 0.0325, "step": 11777 }, { "epoch": 0.9169772760449613, "grad_norm": 0.5478699714452535, "learning_rate": 7.037302422905057e-07, "loss": 0.0526, "step": 11778 }, { "epoch": 0.917055131137171, "grad_norm": 0.2799796014250886, "learning_rate": 7.024183218829184e-07, "loss": 0.0105, "step": 11779 }, { "epoch": 0.9171329862293806, "grad_norm": 0.4474071511201407, "learning_rate": 7.01107603623965e-07, "loss": 0.0312, "step": 11780 }, { "epoch": 0.9172108413215901, "grad_norm": 0.3271915488489217, "learning_rate": 6.997980875952981e-07, "loss": 0.0217, "step": 11781 }, { "epoch": 0.9172886964137998, "grad_norm": 0.43820409551247236, "learning_rate": 6.98489773878488e-07, "loss": 0.0347, "step": 11782 }, { "epoch": 0.9173665515060094, "grad_norm": 0.6195092127192245, "learning_rate": 6.971826625550404e-07, "loss": 0.0642, "step": 11783 }, { "epoch": 0.917444406598219, "grad_norm": 0.3614038340663524, "learning_rate": 6.958767537063793e-07, "loss": 0.0306, "step": 11784 }, { "epoch": 0.9175222616904287, "grad_norm": 0.5683588063910859, "learning_rate": 6.945720474138618e-07, "loss": 0.056, "step": 11785 }, { "epoch": 0.9176001167826383, "grad_norm": 0.5105964188254887, "learning_rate": 6.932685437587627e-07, "loss": 0.0545, "step": 11786 }, { "epoch": 0.9176779718748479, "grad_norm": 0.4310596452625658, "learning_rate": 6.919662428222817e-07, "loss": 0.0379, "step": 11787 }, { "epoch": 0.9177558269670576, "grad_norm": 0.45803959499725516, "learning_rate": 6.906651446855495e-07, "loss": 0.0439, "step": 11788 }, { "epoch": 0.9178336820592672, "grad_norm": 0.47769921490693634, "learning_rate": 6.893652494296165e-07, "loss": 0.0464, "step": 11789 }, { "epoch": 0.9179115371514768, "grad_norm": 0.4763266047158307, "learning_rate": 6.880665571354605e-07, "loss": 0.0409, "step": 11790 }, { "epoch": 0.9179893922436865, "grad_norm": 0.4028262517174096, "learning_rate": 6.867690678839856e-07, "loss": 0.0297, "step": 11791 }, { "epoch": 0.9180672473358961, "grad_norm": 0.3112086331952692, "learning_rate": 6.854727817560159e-07, "loss": 0.0212, "step": 11792 }, { "epoch": 0.9181451024281057, "grad_norm": 0.49487798858509896, "learning_rate": 6.841776988323046e-07, "loss": 0.0406, "step": 11793 }, { "epoch": 0.9182229575203154, "grad_norm": 0.37423386067532577, "learning_rate": 6.828838191935316e-07, "loss": 0.0361, "step": 11794 }, { "epoch": 0.9183008126125249, "grad_norm": 0.2882822192992199, "learning_rate": 6.815911429202949e-07, "loss": 0.0219, "step": 11795 }, { "epoch": 0.9183786677047345, "grad_norm": 0.5113642029730958, "learning_rate": 6.802996700931297e-07, "loss": 0.0413, "step": 11796 }, { "epoch": 0.9184565227969442, "grad_norm": 0.3928623476331103, "learning_rate": 6.790094007924831e-07, "loss": 0.0323, "step": 11797 }, { "epoch": 0.9185343778891538, "grad_norm": 0.37236727002223496, "learning_rate": 6.777203350987327e-07, "loss": 0.0288, "step": 11798 }, { "epoch": 0.9186122329813634, "grad_norm": 0.3816010695427827, "learning_rate": 6.764324730921856e-07, "loss": 0.0316, "step": 11799 }, { "epoch": 0.9186900880735731, "grad_norm": 0.325951991057172, "learning_rate": 6.751458148530643e-07, "loss": 0.022, "step": 11800 }, { "epoch": 0.9186900880735731, "eval_loss": 0.004886949434876442, "eval_runtime": 162.9878, "eval_samples_per_second": 17.67, "eval_steps_per_second": 0.632, "step": 11800 }, { "epoch": 0.9187679431657827, "grad_norm": 0.5065184185308306, "learning_rate": 6.738603604615245e-07, "loss": 0.0465, "step": 11801 }, { "epoch": 0.9188457982579923, "grad_norm": 0.4396146614628366, "learning_rate": 6.725761099976447e-07, "loss": 0.0389, "step": 11802 }, { "epoch": 0.918923653350202, "grad_norm": 0.4338437220357391, "learning_rate": 6.71293063541425e-07, "loss": 0.0309, "step": 11803 }, { "epoch": 0.9190015084424116, "grad_norm": 0.46471667057411176, "learning_rate": 6.700112211727949e-07, "loss": 0.0359, "step": 11804 }, { "epoch": 0.9190793635346212, "grad_norm": 0.5094494297293757, "learning_rate": 6.68730582971604e-07, "loss": 0.0501, "step": 11805 }, { "epoch": 0.9191572186268309, "grad_norm": 0.3712906371324401, "learning_rate": 6.674511490176372e-07, "loss": 0.0228, "step": 11806 }, { "epoch": 0.9192350737190405, "grad_norm": 0.47819398399123864, "learning_rate": 6.661729193905908e-07, "loss": 0.0443, "step": 11807 }, { "epoch": 0.91931292881125, "grad_norm": 0.3832918502050155, "learning_rate": 6.648958941700945e-07, "loss": 0.0265, "step": 11808 }, { "epoch": 0.9193907839034597, "grad_norm": 0.3937289186766129, "learning_rate": 6.636200734357002e-07, "loss": 0.0387, "step": 11809 }, { "epoch": 0.9194686389956693, "grad_norm": 0.44509072014880563, "learning_rate": 6.623454572668886e-07, "loss": 0.0383, "step": 11810 }, { "epoch": 0.9195464940878789, "grad_norm": 0.35201781060407267, "learning_rate": 6.610720457430586e-07, "loss": 0.0218, "step": 11811 }, { "epoch": 0.9196243491800885, "grad_norm": 0.42123898027733536, "learning_rate": 6.5979983894354e-07, "loss": 0.0288, "step": 11812 }, { "epoch": 0.9197022042722982, "grad_norm": 0.4444369312656896, "learning_rate": 6.585288369475829e-07, "loss": 0.0361, "step": 11813 }, { "epoch": 0.9197800593645078, "grad_norm": 0.46094006279218364, "learning_rate": 6.572590398343681e-07, "loss": 0.0418, "step": 11814 }, { "epoch": 0.9198579144567174, "grad_norm": 0.35538393689657394, "learning_rate": 6.559904476829925e-07, "loss": 0.0271, "step": 11815 }, { "epoch": 0.9199357695489271, "grad_norm": 0.5330582908660141, "learning_rate": 6.547230605724908e-07, "loss": 0.054, "step": 11816 }, { "epoch": 0.9200136246411367, "grad_norm": 0.36305906377122465, "learning_rate": 6.534568785818108e-07, "loss": 0.0257, "step": 11817 }, { "epoch": 0.9200914797333463, "grad_norm": 0.5193652703004541, "learning_rate": 6.521919017898292e-07, "loss": 0.0632, "step": 11818 }, { "epoch": 0.920169334825556, "grad_norm": 0.41748194526802984, "learning_rate": 6.5092813027535e-07, "loss": 0.0353, "step": 11819 }, { "epoch": 0.9202471899177656, "grad_norm": 0.4317079654549608, "learning_rate": 6.496655641171013e-07, "loss": 0.0409, "step": 11820 }, { "epoch": 0.9203250450099751, "grad_norm": 0.3787667080368523, "learning_rate": 6.484042033937332e-07, "loss": 0.038, "step": 11821 }, { "epoch": 0.9204029001021848, "grad_norm": 0.41701019912692955, "learning_rate": 6.471440481838209e-07, "loss": 0.0333, "step": 11822 }, { "epoch": 0.9204807551943944, "grad_norm": 0.34563459830656196, "learning_rate": 6.458850985658683e-07, "loss": 0.0191, "step": 11823 }, { "epoch": 0.920558610286604, "grad_norm": 0.38150930541204253, "learning_rate": 6.446273546183035e-07, "loss": 0.0283, "step": 11824 }, { "epoch": 0.9206364653788137, "grad_norm": 0.45474595669597295, "learning_rate": 6.433708164194751e-07, "loss": 0.0343, "step": 11825 }, { "epoch": 0.9207143204710233, "grad_norm": 0.37429211454444056, "learning_rate": 6.421154840476584e-07, "loss": 0.0198, "step": 11826 }, { "epoch": 0.9207921755632329, "grad_norm": 0.3475091249850394, "learning_rate": 6.408613575810574e-07, "loss": 0.0275, "step": 11827 }, { "epoch": 0.9208700306554426, "grad_norm": 0.3436121390710955, "learning_rate": 6.396084370978006e-07, "loss": 0.0245, "step": 11828 }, { "epoch": 0.9209478857476522, "grad_norm": 0.3643489855115296, "learning_rate": 6.383567226759346e-07, "loss": 0.0251, "step": 11829 }, { "epoch": 0.9210257408398618, "grad_norm": 0.4137403515798612, "learning_rate": 6.371062143934392e-07, "loss": 0.0356, "step": 11830 }, { "epoch": 0.9211035959320715, "grad_norm": 0.46170453438366493, "learning_rate": 6.358569123282099e-07, "loss": 0.0353, "step": 11831 }, { "epoch": 0.9211814510242811, "grad_norm": 0.5766117555000352, "learning_rate": 6.346088165580778e-07, "loss": 0.0476, "step": 11832 }, { "epoch": 0.9212593061164907, "grad_norm": 0.4145900286836541, "learning_rate": 6.333619271607894e-07, "loss": 0.0419, "step": 11833 }, { "epoch": 0.9213371612087004, "grad_norm": 0.3164394767696379, "learning_rate": 6.321162442140228e-07, "loss": 0.0205, "step": 11834 }, { "epoch": 0.9214150163009099, "grad_norm": 0.39175733786923567, "learning_rate": 6.308717677953757e-07, "loss": 0.0373, "step": 11835 }, { "epoch": 0.9214928713931195, "grad_norm": 0.40597380907206293, "learning_rate": 6.296284979823753e-07, "loss": 0.0306, "step": 11836 }, { "epoch": 0.9215707264853292, "grad_norm": 0.34172461084384537, "learning_rate": 6.283864348524704e-07, "loss": 0.0302, "step": 11837 }, { "epoch": 0.9216485815775388, "grad_norm": 0.39830618195999196, "learning_rate": 6.271455784830371e-07, "loss": 0.0306, "step": 11838 }, { "epoch": 0.9217264366697484, "grad_norm": 0.4296971864669805, "learning_rate": 6.259059289513758e-07, "loss": 0.0345, "step": 11839 }, { "epoch": 0.9218042917619581, "grad_norm": 0.46043321185293046, "learning_rate": 6.246674863347047e-07, "loss": 0.0342, "step": 11840 }, { "epoch": 0.9218821468541677, "grad_norm": 0.4719034759797885, "learning_rate": 6.234302507101775e-07, "loss": 0.0515, "step": 11841 }, { "epoch": 0.9219600019463773, "grad_norm": 0.4545717111615718, "learning_rate": 6.221942221548683e-07, "loss": 0.0431, "step": 11842 }, { "epoch": 0.922037857038587, "grad_norm": 0.4001057354099493, "learning_rate": 6.209594007457753e-07, "loss": 0.0383, "step": 11843 }, { "epoch": 0.9221157121307966, "grad_norm": 0.5157595909530077, "learning_rate": 6.197257865598239e-07, "loss": 0.0553, "step": 11844 }, { "epoch": 0.9221935672230062, "grad_norm": 0.5403847529470136, "learning_rate": 6.184933796738591e-07, "loss": 0.0364, "step": 11845 }, { "epoch": 0.9222714223152157, "grad_norm": 0.3350673695888122, "learning_rate": 6.17262180164655e-07, "loss": 0.0164, "step": 11846 }, { "epoch": 0.9223492774074254, "grad_norm": 0.4822171196100728, "learning_rate": 6.160321881089104e-07, "loss": 0.0331, "step": 11847 }, { "epoch": 0.922427132499635, "grad_norm": 0.4349882791339616, "learning_rate": 6.148034035832507e-07, "loss": 0.039, "step": 11848 }, { "epoch": 0.9225049875918446, "grad_norm": 0.48291471226536803, "learning_rate": 6.135758266642189e-07, "loss": 0.0494, "step": 11849 }, { "epoch": 0.9225828426840543, "grad_norm": 0.3218989501982691, "learning_rate": 6.123494574282874e-07, "loss": 0.018, "step": 11850 }, { "epoch": 0.9225828426840543, "eval_loss": 0.00487239845097065, "eval_runtime": 162.2446, "eval_samples_per_second": 17.751, "eval_steps_per_second": 0.635, "step": 11850 }, { "epoch": 0.9226606977762639, "grad_norm": 0.41596028740963215, "learning_rate": 6.111242959518549e-07, "loss": 0.0261, "step": 11851 }, { "epoch": 0.9227385528684735, "grad_norm": 0.5509834860082018, "learning_rate": 6.099003423112426e-07, "loss": 0.0524, "step": 11852 }, { "epoch": 0.9228164079606832, "grad_norm": 0.3257024865204534, "learning_rate": 6.086775965826986e-07, "loss": 0.0158, "step": 11853 }, { "epoch": 0.9228942630528928, "grad_norm": 0.4434183239064856, "learning_rate": 6.074560588423928e-07, "loss": 0.0359, "step": 11854 }, { "epoch": 0.9229721181451024, "grad_norm": 0.44724230567623163, "learning_rate": 6.062357291664223e-07, "loss": 0.0374, "step": 11855 }, { "epoch": 0.9230499732373121, "grad_norm": 0.44657799384207225, "learning_rate": 6.05016607630804e-07, "loss": 0.0373, "step": 11856 }, { "epoch": 0.9231278283295217, "grad_norm": 0.5597824999759673, "learning_rate": 6.037986943114926e-07, "loss": 0.0513, "step": 11857 }, { "epoch": 0.9232056834217313, "grad_norm": 0.450361710365005, "learning_rate": 6.025819892843476e-07, "loss": 0.0231, "step": 11858 }, { "epoch": 0.923283538513941, "grad_norm": 0.33709807258493935, "learning_rate": 6.013664926251705e-07, "loss": 0.025, "step": 11859 }, { "epoch": 0.9233613936061505, "grad_norm": 0.5388813312913605, "learning_rate": 6.001522044096785e-07, "loss": 0.0517, "step": 11860 }, { "epoch": 0.9234392486983601, "grad_norm": 0.505618077322438, "learning_rate": 5.989391247135157e-07, "loss": 0.0398, "step": 11861 }, { "epoch": 0.9235171037905698, "grad_norm": 0.33210708022332497, "learning_rate": 5.977272536122547e-07, "loss": 0.0216, "step": 11862 }, { "epoch": 0.9235949588827794, "grad_norm": 0.35307441799029293, "learning_rate": 5.965165911813864e-07, "loss": 0.024, "step": 11863 }, { "epoch": 0.923672813974989, "grad_norm": 0.3022865217281281, "learning_rate": 5.953071374963304e-07, "loss": 0.0187, "step": 11864 }, { "epoch": 0.9237506690671987, "grad_norm": 0.3750650929134852, "learning_rate": 5.940988926324287e-07, "loss": 0.0349, "step": 11865 }, { "epoch": 0.9238285241594083, "grad_norm": 0.437559628627131, "learning_rate": 5.9289185666495e-07, "loss": 0.0362, "step": 11866 }, { "epoch": 0.9239063792516179, "grad_norm": 0.5199994108658763, "learning_rate": 5.916860296690874e-07, "loss": 0.0631, "step": 11867 }, { "epoch": 0.9239842343438276, "grad_norm": 0.45103708037359547, "learning_rate": 5.904814117199586e-07, "loss": 0.0361, "step": 11868 }, { "epoch": 0.9240620894360372, "grad_norm": 0.4822622250011752, "learning_rate": 5.892780028926059e-07, "loss": 0.0391, "step": 11869 }, { "epoch": 0.9241399445282468, "grad_norm": 0.3343063703607486, "learning_rate": 5.880758032619938e-07, "loss": 0.0271, "step": 11870 }, { "epoch": 0.9242177996204565, "grad_norm": 0.5181401796653767, "learning_rate": 5.868748129030155e-07, "loss": 0.057, "step": 11871 }, { "epoch": 0.9242956547126661, "grad_norm": 0.39046801122683106, "learning_rate": 5.856750318904847e-07, "loss": 0.0324, "step": 11872 }, { "epoch": 0.9243735098048756, "grad_norm": 0.4764108352942403, "learning_rate": 5.844764602991459e-07, "loss": 0.0396, "step": 11873 }, { "epoch": 0.9244513648970853, "grad_norm": 0.4540241075679574, "learning_rate": 5.832790982036618e-07, "loss": 0.0353, "step": 11874 }, { "epoch": 0.9245292199892949, "grad_norm": 0.5070536250635647, "learning_rate": 5.820829456786237e-07, "loss": 0.0533, "step": 11875 }, { "epoch": 0.9246070750815045, "grad_norm": 0.29104531414402746, "learning_rate": 5.808880027985453e-07, "loss": 0.0192, "step": 11876 }, { "epoch": 0.9246849301737142, "grad_norm": 0.4317838290373104, "learning_rate": 5.796942696378672e-07, "loss": 0.0424, "step": 11877 }, { "epoch": 0.9247627852659238, "grad_norm": 0.5132117011008294, "learning_rate": 5.785017462709541e-07, "loss": 0.0475, "step": 11878 }, { "epoch": 0.9248406403581334, "grad_norm": 0.3771255416560225, "learning_rate": 5.773104327720913e-07, "loss": 0.0303, "step": 11879 }, { "epoch": 0.9249184954503431, "grad_norm": 0.354979374449205, "learning_rate": 5.761203292154948e-07, "loss": 0.0242, "step": 11880 }, { "epoch": 0.9249963505425527, "grad_norm": 0.44524665720156154, "learning_rate": 5.749314356752988e-07, "loss": 0.0342, "step": 11881 }, { "epoch": 0.9250742056347623, "grad_norm": 0.49454946580392123, "learning_rate": 5.737437522255685e-07, "loss": 0.0468, "step": 11882 }, { "epoch": 0.9251520607269719, "grad_norm": 0.33522726183775725, "learning_rate": 5.725572789402934e-07, "loss": 0.0255, "step": 11883 }, { "epoch": 0.9252299158191816, "grad_norm": 0.4264026002624005, "learning_rate": 5.713720158933833e-07, "loss": 0.0422, "step": 11884 }, { "epoch": 0.9253077709113912, "grad_norm": 0.5098091093539799, "learning_rate": 5.701879631586727e-07, "loss": 0.0446, "step": 11885 }, { "epoch": 0.9253856260036007, "grad_norm": 0.3231424595908914, "learning_rate": 5.690051208099246e-07, "loss": 0.0152, "step": 11886 }, { "epoch": 0.9254634810958104, "grad_norm": 0.4574065438832709, "learning_rate": 5.678234889208223e-07, "loss": 0.0416, "step": 11887 }, { "epoch": 0.92554133618802, "grad_norm": 0.42940845807720096, "learning_rate": 5.666430675649781e-07, "loss": 0.0318, "step": 11888 }, { "epoch": 0.9256191912802296, "grad_norm": 0.43072792420489625, "learning_rate": 5.654638568159265e-07, "loss": 0.0359, "step": 11889 }, { "epoch": 0.9256970463724393, "grad_norm": 0.5072824841286795, "learning_rate": 5.642858567471266e-07, "loss": 0.0463, "step": 11890 }, { "epoch": 0.9257749014646489, "grad_norm": 0.42785174871814735, "learning_rate": 5.631090674319617e-07, "loss": 0.0424, "step": 11891 }, { "epoch": 0.9258527565568585, "grad_norm": 0.48389168891820483, "learning_rate": 5.619334889437377e-07, "loss": 0.0425, "step": 11892 }, { "epoch": 0.9259306116490682, "grad_norm": 0.4014697866351352, "learning_rate": 5.607591213556917e-07, "loss": 0.0296, "step": 11893 }, { "epoch": 0.9260084667412778, "grad_norm": 0.3954887051014249, "learning_rate": 5.595859647409807e-07, "loss": 0.0363, "step": 11894 }, { "epoch": 0.9260863218334874, "grad_norm": 0.5040933296726443, "learning_rate": 5.584140191726861e-07, "loss": 0.0553, "step": 11895 }, { "epoch": 0.9261641769256971, "grad_norm": 0.46792288039786667, "learning_rate": 5.572432847238185e-07, "loss": 0.041, "step": 11896 }, { "epoch": 0.9262420320179067, "grad_norm": 0.40745511300982157, "learning_rate": 5.560737614672973e-07, "loss": 0.0313, "step": 11897 }, { "epoch": 0.9263198871101163, "grad_norm": 0.3412019113852342, "learning_rate": 5.54905449475991e-07, "loss": 0.0247, "step": 11898 }, { "epoch": 0.926397742202326, "grad_norm": 0.5661215301055822, "learning_rate": 5.537383488226744e-07, "loss": 0.0446, "step": 11899 }, { "epoch": 0.9264755972945355, "grad_norm": 0.4896068370935446, "learning_rate": 5.525724595800541e-07, "loss": 0.0457, "step": 11900 }, { "epoch": 0.9264755972945355, "eval_loss": 0.004855655133724213, "eval_runtime": 163.7083, "eval_samples_per_second": 17.592, "eval_steps_per_second": 0.629, "step": 11900 }, { "epoch": 0.9265534523867451, "grad_norm": 0.5328721331946272, "learning_rate": 5.514077818207586e-07, "loss": 0.0495, "step": 11901 }, { "epoch": 0.9266313074789548, "grad_norm": 0.4083330509810519, "learning_rate": 5.50244315617341e-07, "loss": 0.0331, "step": 11902 }, { "epoch": 0.9267091625711644, "grad_norm": 0.45578183747009904, "learning_rate": 5.490820610422787e-07, "loss": 0.0448, "step": 11903 }, { "epoch": 0.926787017663374, "grad_norm": 0.4260296595996489, "learning_rate": 5.479210181679784e-07, "loss": 0.0381, "step": 11904 }, { "epoch": 0.9268648727555837, "grad_norm": 0.4804742976743782, "learning_rate": 5.467611870667667e-07, "loss": 0.0367, "step": 11905 }, { "epoch": 0.9269427278477933, "grad_norm": 0.3992396235567929, "learning_rate": 5.456025678108945e-07, "loss": 0.0323, "step": 11906 }, { "epoch": 0.9270205829400029, "grad_norm": 0.41987298004989615, "learning_rate": 5.444451604725376e-07, "loss": 0.0334, "step": 11907 }, { "epoch": 0.9270984380322126, "grad_norm": 0.510345621882102, "learning_rate": 5.43288965123796e-07, "loss": 0.0582, "step": 11908 }, { "epoch": 0.9271762931244222, "grad_norm": 0.5405649953088442, "learning_rate": 5.42133981836701e-07, "loss": 0.0559, "step": 11909 }, { "epoch": 0.9272541482166318, "grad_norm": 0.5126227204948288, "learning_rate": 5.409802106831974e-07, "loss": 0.0474, "step": 11910 }, { "epoch": 0.9273320033088415, "grad_norm": 0.3029735986704101, "learning_rate": 5.398276517351608e-07, "loss": 0.0186, "step": 11911 }, { "epoch": 0.9274098584010511, "grad_norm": 0.579802357688765, "learning_rate": 5.386763050643917e-07, "loss": 0.0547, "step": 11912 }, { "epoch": 0.9274877134932606, "grad_norm": 0.3867984418534242, "learning_rate": 5.375261707426105e-07, "loss": 0.0283, "step": 11913 }, { "epoch": 0.9275655685854703, "grad_norm": 0.35503561101628195, "learning_rate": 5.363772488414709e-07, "loss": 0.022, "step": 11914 }, { "epoch": 0.9276434236776799, "grad_norm": 0.4392429739247158, "learning_rate": 5.352295394325402e-07, "loss": 0.0304, "step": 11915 }, { "epoch": 0.9277212787698895, "grad_norm": 0.3749060810932913, "learning_rate": 5.340830425873144e-07, "loss": 0.0226, "step": 11916 }, { "epoch": 0.9277991338620991, "grad_norm": 0.5355715600376758, "learning_rate": 5.329377583772188e-07, "loss": 0.0374, "step": 11917 }, { "epoch": 0.9278769889543088, "grad_norm": 0.47255725179585, "learning_rate": 5.317936868735962e-07, "loss": 0.0396, "step": 11918 }, { "epoch": 0.9279548440465184, "grad_norm": 0.39987294408361285, "learning_rate": 5.306508281477185e-07, "loss": 0.0271, "step": 11919 }, { "epoch": 0.928032699138728, "grad_norm": 0.5215586463086538, "learning_rate": 5.295091822707821e-07, "loss": 0.0465, "step": 11920 }, { "epoch": 0.9281105542309377, "grad_norm": 0.4884750387548025, "learning_rate": 5.283687493139033e-07, "loss": 0.0459, "step": 11921 }, { "epoch": 0.9281884093231473, "grad_norm": 0.40682569580721967, "learning_rate": 5.272295293481255e-07, "loss": 0.0359, "step": 11922 }, { "epoch": 0.9282662644153569, "grad_norm": 0.4413030864172427, "learning_rate": 5.260915224444207e-07, "loss": 0.0413, "step": 11923 }, { "epoch": 0.9283441195075666, "grad_norm": 0.4248067124213743, "learning_rate": 5.249547286736767e-07, "loss": 0.0349, "step": 11924 }, { "epoch": 0.9284219745997762, "grad_norm": 0.3422462296175405, "learning_rate": 5.238191481067124e-07, "loss": 0.0212, "step": 11925 }, { "epoch": 0.9284998296919857, "grad_norm": 0.5010017859328506, "learning_rate": 5.226847808142688e-07, "loss": 0.0452, "step": 11926 }, { "epoch": 0.9285776847841954, "grad_norm": 0.46943464340039226, "learning_rate": 5.215516268670118e-07, "loss": 0.0413, "step": 11927 }, { "epoch": 0.928655539876405, "grad_norm": 0.4412983481652536, "learning_rate": 5.204196863355315e-07, "loss": 0.0429, "step": 11928 }, { "epoch": 0.9287333949686146, "grad_norm": 0.518934858775774, "learning_rate": 5.192889592903427e-07, "loss": 0.0505, "step": 11929 }, { "epoch": 0.9288112500608243, "grad_norm": 0.3641670680122675, "learning_rate": 5.181594458018846e-07, "loss": 0.0261, "step": 11930 }, { "epoch": 0.9288891051530339, "grad_norm": 0.590807330445309, "learning_rate": 5.170311459405208e-07, "loss": 0.0678, "step": 11931 }, { "epoch": 0.9289669602452435, "grad_norm": 0.5389296474612952, "learning_rate": 5.159040597765397e-07, "loss": 0.0467, "step": 11932 }, { "epoch": 0.9290448153374532, "grad_norm": 0.47397770987385573, "learning_rate": 5.147781873801494e-07, "loss": 0.0437, "step": 11933 }, { "epoch": 0.9291226704296628, "grad_norm": 0.44635463633337574, "learning_rate": 5.136535288214917e-07, "loss": 0.0305, "step": 11934 }, { "epoch": 0.9292005255218724, "grad_norm": 0.3710607814008647, "learning_rate": 5.125300841706238e-07, "loss": 0.0268, "step": 11935 }, { "epoch": 0.9292783806140821, "grad_norm": 0.5428601273911446, "learning_rate": 5.114078534975342e-07, "loss": 0.0544, "step": 11936 }, { "epoch": 0.9293562357062917, "grad_norm": 0.44269589745250576, "learning_rate": 5.102868368721292e-07, "loss": 0.0289, "step": 11937 }, { "epoch": 0.9294340907985013, "grad_norm": 0.484031080483142, "learning_rate": 5.091670343642441e-07, "loss": 0.0372, "step": 11938 }, { "epoch": 0.929511945890711, "grad_norm": 0.3829127956742078, "learning_rate": 5.080484460436385e-07, "loss": 0.0256, "step": 11939 }, { "epoch": 0.9295898009829205, "grad_norm": 0.3345503060968145, "learning_rate": 5.069310719799947e-07, "loss": 0.0161, "step": 11940 }, { "epoch": 0.9296676560751301, "grad_norm": 0.41866944306048287, "learning_rate": 5.05814912242919e-07, "loss": 0.0378, "step": 11941 }, { "epoch": 0.9297455111673398, "grad_norm": 0.4862299626915589, "learning_rate": 5.046999669019448e-07, "loss": 0.0512, "step": 11942 }, { "epoch": 0.9298233662595494, "grad_norm": 0.4413890911927051, "learning_rate": 5.035862360265253e-07, "loss": 0.0361, "step": 11943 }, { "epoch": 0.929901221351759, "grad_norm": 0.46684837116491207, "learning_rate": 5.02473719686043e-07, "loss": 0.043, "step": 11944 }, { "epoch": 0.9299790764439687, "grad_norm": 0.4072773406765476, "learning_rate": 5.013624179498022e-07, "loss": 0.0355, "step": 11945 }, { "epoch": 0.9300569315361783, "grad_norm": 0.43916715604644485, "learning_rate": 5.002523308870299e-07, "loss": 0.0353, "step": 11946 }, { "epoch": 0.9301347866283879, "grad_norm": 0.41843998359842005, "learning_rate": 4.991434585668798e-07, "loss": 0.0368, "step": 11947 }, { "epoch": 0.9302126417205976, "grad_norm": 0.44175285076348464, "learning_rate": 4.980358010584318e-07, "loss": 0.043, "step": 11948 }, { "epoch": 0.9302904968128072, "grad_norm": 0.42617638288893955, "learning_rate": 4.969293584306844e-07, "loss": 0.0287, "step": 11949 }, { "epoch": 0.9303683519050168, "grad_norm": 0.4966661082784912, "learning_rate": 4.958241307525647e-07, "loss": 0.0407, "step": 11950 }, { "epoch": 0.9303683519050168, "eval_loss": 0.004839424043893814, "eval_runtime": 162.7138, "eval_samples_per_second": 17.7, "eval_steps_per_second": 0.633, "step": 11950 }, { "epoch": 0.9304462069972265, "grad_norm": 0.36227171952878856, "learning_rate": 4.947201180929262e-07, "loss": 0.0264, "step": 11951 }, { "epoch": 0.930524062089436, "grad_norm": 0.4079375060629342, "learning_rate": 4.936173205205408e-07, "loss": 0.0334, "step": 11952 }, { "epoch": 0.9306019171816456, "grad_norm": 0.4107488413911963, "learning_rate": 4.92515738104109e-07, "loss": 0.0431, "step": 11953 }, { "epoch": 0.9306797722738552, "grad_norm": 0.46377337463615187, "learning_rate": 4.914153709122515e-07, "loss": 0.0363, "step": 11954 }, { "epoch": 0.9307576273660649, "grad_norm": 0.5392053077289568, "learning_rate": 4.903162190135202e-07, "loss": 0.065, "step": 11955 }, { "epoch": 0.9308354824582745, "grad_norm": 0.282793668460629, "learning_rate": 4.892182824763825e-07, "loss": 0.0117, "step": 11956 }, { "epoch": 0.9309133375504841, "grad_norm": 0.3266684904280781, "learning_rate": 4.881215613692391e-07, "loss": 0.0203, "step": 11957 }, { "epoch": 0.9309911926426938, "grad_norm": 0.26249336091994974, "learning_rate": 4.870260557604067e-07, "loss": 0.012, "step": 11958 }, { "epoch": 0.9310690477349034, "grad_norm": 0.3966889030317844, "learning_rate": 4.859317657181329e-07, "loss": 0.0258, "step": 11959 }, { "epoch": 0.931146902827113, "grad_norm": 0.3626110979904535, "learning_rate": 4.848386913105851e-07, "loss": 0.0255, "step": 11960 }, { "epoch": 0.9312247579193227, "grad_norm": 0.27751290771215176, "learning_rate": 4.837468326058581e-07, "loss": 0.0134, "step": 11961 }, { "epoch": 0.9313026130115323, "grad_norm": 0.4351200795348884, "learning_rate": 4.826561896719683e-07, "loss": 0.0375, "step": 11962 }, { "epoch": 0.9313804681037419, "grad_norm": 0.37750090967399286, "learning_rate": 4.815667625768616e-07, "loss": 0.0309, "step": 11963 }, { "epoch": 0.9314583231959516, "grad_norm": 0.44622634336795214, "learning_rate": 4.804785513883969e-07, "loss": 0.0418, "step": 11964 }, { "epoch": 0.9315361782881612, "grad_norm": 0.37275997457364823, "learning_rate": 4.793915561743711e-07, "loss": 0.0324, "step": 11965 }, { "epoch": 0.9316140333803707, "grad_norm": 0.5308482734001982, "learning_rate": 4.783057770024945e-07, "loss": 0.0513, "step": 11966 }, { "epoch": 0.9316918884725804, "grad_norm": 0.42800954883115305, "learning_rate": 4.772212139404086e-07, "loss": 0.0368, "step": 11967 }, { "epoch": 0.93176974356479, "grad_norm": 0.4105112657405206, "learning_rate": 4.761378670556771e-07, "loss": 0.0372, "step": 11968 }, { "epoch": 0.9318475986569996, "grad_norm": 0.42469374654832454, "learning_rate": 4.750557364157815e-07, "loss": 0.0383, "step": 11969 }, { "epoch": 0.9319254537492093, "grad_norm": 0.4772268128823981, "learning_rate": 4.739748220881435e-07, "loss": 0.0419, "step": 11970 }, { "epoch": 0.9320033088414189, "grad_norm": 0.43280341067755446, "learning_rate": 4.728951241400914e-07, "loss": 0.0306, "step": 11971 }, { "epoch": 0.9320811639336285, "grad_norm": 0.36670934999871174, "learning_rate": 4.7181664263888926e-07, "loss": 0.0274, "step": 11972 }, { "epoch": 0.9321590190258382, "grad_norm": 0.5582803731982992, "learning_rate": 4.7073937765171886e-07, "loss": 0.0639, "step": 11973 }, { "epoch": 0.9322368741180478, "grad_norm": 0.38021584767619127, "learning_rate": 4.6966332924568867e-07, "loss": 0.0215, "step": 11974 }, { "epoch": 0.9323147292102574, "grad_norm": 0.4140397517557721, "learning_rate": 4.685884974878319e-07, "loss": 0.0357, "step": 11975 }, { "epoch": 0.9323925843024671, "grad_norm": 0.35140297556250344, "learning_rate": 4.675148824451081e-07, "loss": 0.0234, "step": 11976 }, { "epoch": 0.9324704393946767, "grad_norm": 0.5511205035245698, "learning_rate": 4.664424841843951e-07, "loss": 0.0565, "step": 11977 }, { "epoch": 0.9325482944868863, "grad_norm": 0.4556048732956774, "learning_rate": 4.653713027724993e-07, "loss": 0.0418, "step": 11978 }, { "epoch": 0.932626149579096, "grad_norm": 0.42930996034255875, "learning_rate": 4.643013382761519e-07, "loss": 0.0383, "step": 11979 }, { "epoch": 0.9327040046713055, "grad_norm": 0.4286106803192099, "learning_rate": 4.6323259076200166e-07, "loss": 0.0283, "step": 11980 }, { "epoch": 0.9327818597635151, "grad_norm": 0.4126589968607331, "learning_rate": 4.6216506029663543e-07, "loss": 0.0195, "step": 11981 }, { "epoch": 0.9328597148557248, "grad_norm": 0.5120041456311525, "learning_rate": 4.6109874694654664e-07, "loss": 0.0474, "step": 11982 }, { "epoch": 0.9329375699479344, "grad_norm": 0.511039917164668, "learning_rate": 4.6003365077816666e-07, "loss": 0.0467, "step": 11983 }, { "epoch": 0.933015425040144, "grad_norm": 0.48012974818682297, "learning_rate": 4.589697718578423e-07, "loss": 0.0409, "step": 11984 }, { "epoch": 0.9330932801323537, "grad_norm": 0.4517161971728789, "learning_rate": 4.579071102518495e-07, "loss": 0.0398, "step": 11985 }, { "epoch": 0.9331711352245633, "grad_norm": 0.3647933656506455, "learning_rate": 4.568456660263887e-07, "loss": 0.0285, "step": 11986 }, { "epoch": 0.9332489903167729, "grad_norm": 0.37538892633059484, "learning_rate": 4.5578543924758467e-07, "loss": 0.0291, "step": 11987 }, { "epoch": 0.9333268454089826, "grad_norm": 0.32621055510219543, "learning_rate": 4.54726429981478e-07, "loss": 0.0221, "step": 11988 }, { "epoch": 0.9334047005011922, "grad_norm": 0.46949491395946713, "learning_rate": 4.5366863829404695e-07, "loss": 0.0431, "step": 11989 }, { "epoch": 0.9334825555934018, "grad_norm": 0.3997682958922665, "learning_rate": 4.5261206425118333e-07, "loss": 0.0316, "step": 11990 }, { "epoch": 0.9335604106856114, "grad_norm": 0.47687486156470443, "learning_rate": 4.5155670791870556e-07, "loss": 0.0426, "step": 11991 }, { "epoch": 0.933638265777821, "grad_norm": 0.44423924174192136, "learning_rate": 4.5050256936236103e-07, "loss": 0.0357, "step": 11992 }, { "epoch": 0.9337161208700306, "grad_norm": 0.48027250782227915, "learning_rate": 4.49449648647815e-07, "loss": 0.0381, "step": 11993 }, { "epoch": 0.9337939759622402, "grad_norm": 0.44694916685770286, "learning_rate": 4.4839794584065953e-07, "loss": 0.0337, "step": 11994 }, { "epoch": 0.9338718310544499, "grad_norm": 0.4916836072694899, "learning_rate": 4.473474610064088e-07, "loss": 0.0488, "step": 11995 }, { "epoch": 0.9339496861466595, "grad_norm": 0.43689266690168277, "learning_rate": 4.4629819421050826e-07, "loss": 0.0368, "step": 11996 }, { "epoch": 0.9340275412388691, "grad_norm": 0.37494822002318196, "learning_rate": 4.4525014551832116e-07, "loss": 0.0292, "step": 11997 }, { "epoch": 0.9341053963310788, "grad_norm": 0.45445556264843734, "learning_rate": 4.4420331499513303e-07, "loss": 0.0376, "step": 11998 }, { "epoch": 0.9341832514232884, "grad_norm": 0.41249667326937356, "learning_rate": 4.4315770270615843e-07, "loss": 0.0349, "step": 11999 }, { "epoch": 0.934261106515498, "grad_norm": 0.41626141531436484, "learning_rate": 4.4211330871653414e-07, "loss": 0.0335, "step": 12000 }, { "epoch": 0.934261106515498, "eval_loss": 0.004831336904317141, "eval_runtime": 163.2382, "eval_samples_per_second": 17.643, "eval_steps_per_second": 0.631, "step": 12000 }, { "epoch": 0.9343389616077077, "grad_norm": 0.5508354136291982, "learning_rate": 4.410701330913214e-07, "loss": 0.0505, "step": 12001 }, { "epoch": 0.9344168166999173, "grad_norm": 0.33055569280984687, "learning_rate": 4.400281758955016e-07, "loss": 0.0253, "step": 12002 }, { "epoch": 0.9344946717921269, "grad_norm": 0.3782978300737579, "learning_rate": 4.389874371939873e-07, "loss": 0.0287, "step": 12003 }, { "epoch": 0.9345725268843366, "grad_norm": 0.3910912359703023, "learning_rate": 4.379479170516132e-07, "loss": 0.0331, "step": 12004 }, { "epoch": 0.9346503819765462, "grad_norm": 0.46102952944710174, "learning_rate": 4.3690961553312984e-07, "loss": 0.0464, "step": 12005 }, { "epoch": 0.9347282370687557, "grad_norm": 0.32211872715569395, "learning_rate": 4.3587253270322536e-07, "loss": 0.0203, "step": 12006 }, { "epoch": 0.9348060921609654, "grad_norm": 0.38349884662981854, "learning_rate": 4.3483666862650154e-07, "loss": 0.0252, "step": 12007 }, { "epoch": 0.934883947253175, "grad_norm": 0.39610182208546574, "learning_rate": 4.3380202336749113e-07, "loss": 0.0323, "step": 12008 }, { "epoch": 0.9349618023453846, "grad_norm": 0.3825591658083578, "learning_rate": 4.3276859699064253e-07, "loss": 0.0238, "step": 12009 }, { "epoch": 0.9350396574375943, "grad_norm": 0.46520987548688897, "learning_rate": 4.317363895603377e-07, "loss": 0.0397, "step": 12010 }, { "epoch": 0.9351175125298039, "grad_norm": 0.41153526482265873, "learning_rate": 4.307054011408762e-07, "loss": 0.0264, "step": 12011 }, { "epoch": 0.9351953676220135, "grad_norm": 0.49470383564285814, "learning_rate": 4.2967563179648673e-07, "loss": 0.044, "step": 12012 }, { "epoch": 0.9352732227142232, "grad_norm": 0.38222557438816884, "learning_rate": 4.286470815913135e-07, "loss": 0.0213, "step": 12013 }, { "epoch": 0.9353510778064328, "grad_norm": 0.4053094684296458, "learning_rate": 4.2761975058943636e-07, "loss": 0.0272, "step": 12014 }, { "epoch": 0.9354289328986424, "grad_norm": 0.5183799014780448, "learning_rate": 4.265936388548464e-07, "loss": 0.0522, "step": 12015 }, { "epoch": 0.9355067879908521, "grad_norm": 0.4930901352472437, "learning_rate": 4.255687464514724e-07, "loss": 0.0341, "step": 12016 }, { "epoch": 0.9355846430830617, "grad_norm": 0.34329552975849886, "learning_rate": 4.245450734431589e-07, "loss": 0.0281, "step": 12017 }, { "epoch": 0.9356624981752713, "grad_norm": 0.5431134220073423, "learning_rate": 4.235226198936726e-07, "loss": 0.0525, "step": 12018 }, { "epoch": 0.935740353267481, "grad_norm": 0.4060184123471255, "learning_rate": 4.225013858667115e-07, "loss": 0.0353, "step": 12019 }, { "epoch": 0.9358182083596905, "grad_norm": 0.41901663053618177, "learning_rate": 4.214813714258892e-07, "loss": 0.0387, "step": 12020 }, { "epoch": 0.9358960634519001, "grad_norm": 0.42980126279666325, "learning_rate": 4.2046257663475254e-07, "loss": 0.0394, "step": 12021 }, { "epoch": 0.9359739185441098, "grad_norm": 0.42086315834216176, "learning_rate": 4.1944500155676416e-07, "loss": 0.0338, "step": 12022 }, { "epoch": 0.9360517736363194, "grad_norm": 0.4233630504086921, "learning_rate": 4.1842864625531556e-07, "loss": 0.0273, "step": 12023 }, { "epoch": 0.936129628728529, "grad_norm": 0.37148183972631743, "learning_rate": 4.174135107937205e-07, "loss": 0.0261, "step": 12024 }, { "epoch": 0.9362074838207386, "grad_norm": 0.33946987586766014, "learning_rate": 4.1639959523521735e-07, "loss": 0.0199, "step": 12025 }, { "epoch": 0.9362853389129483, "grad_norm": 0.517004964925028, "learning_rate": 4.1538689964296666e-07, "loss": 0.0591, "step": 12026 }, { "epoch": 0.9363631940051579, "grad_norm": 0.38651871184247116, "learning_rate": 4.143754240800579e-07, "loss": 0.0275, "step": 12027 }, { "epoch": 0.9364410490973675, "grad_norm": 0.4265800365463928, "learning_rate": 4.133651686095008e-07, "loss": 0.0288, "step": 12028 }, { "epoch": 0.9365189041895772, "grad_norm": 0.433253149418201, "learning_rate": 4.123561332942272e-07, "loss": 0.0349, "step": 12029 }, { "epoch": 0.9365967592817868, "grad_norm": 0.43439794558212436, "learning_rate": 4.113483181971001e-07, "loss": 0.0427, "step": 12030 }, { "epoch": 0.9366746143739964, "grad_norm": 0.4196199223507759, "learning_rate": 4.1034172338089373e-07, "loss": 0.0231, "step": 12031 }, { "epoch": 0.936752469466206, "grad_norm": 0.5848143253099349, "learning_rate": 4.09336348908318e-07, "loss": 0.0743, "step": 12032 }, { "epoch": 0.9368303245584156, "grad_norm": 0.5078994310868117, "learning_rate": 4.0833219484200493e-07, "loss": 0.0435, "step": 12033 }, { "epoch": 0.9369081796506252, "grad_norm": 0.36124325226415055, "learning_rate": 4.07329261244509e-07, "loss": 0.022, "step": 12034 }, { "epoch": 0.9369860347428349, "grad_norm": 0.4231858923610602, "learning_rate": 4.063275481783047e-07, "loss": 0.0377, "step": 12035 }, { "epoch": 0.9370638898350445, "grad_norm": 0.49203763822781554, "learning_rate": 4.053270557057931e-07, "loss": 0.0426, "step": 12036 }, { "epoch": 0.9371417449272541, "grad_norm": 0.4713430364014913, "learning_rate": 4.0432778388930447e-07, "loss": 0.0468, "step": 12037 }, { "epoch": 0.9372196000194638, "grad_norm": 0.3603690819196941, "learning_rate": 4.03329732791089e-07, "loss": 0.0267, "step": 12038 }, { "epoch": 0.9372974551116734, "grad_norm": 0.3703579681303301, "learning_rate": 4.023329024733191e-07, "loss": 0.0273, "step": 12039 }, { "epoch": 0.937375310203883, "grad_norm": 0.35072368359974254, "learning_rate": 4.013372929980919e-07, "loss": 0.0202, "step": 12040 }, { "epoch": 0.9374531652960927, "grad_norm": 0.37595388959912324, "learning_rate": 4.0034290442742653e-07, "loss": 0.029, "step": 12041 }, { "epoch": 0.9375310203883023, "grad_norm": 0.4005345652571356, "learning_rate": 3.993497368232735e-07, "loss": 0.0323, "step": 12042 }, { "epoch": 0.9376088754805119, "grad_norm": 0.47812438477278646, "learning_rate": 3.983577902475011e-07, "loss": 0.048, "step": 12043 }, { "epoch": 0.9376867305727216, "grad_norm": 0.48750499611730685, "learning_rate": 3.973670647619021e-07, "loss": 0.0404, "step": 12044 }, { "epoch": 0.9377645856649311, "grad_norm": 0.3921918422473736, "learning_rate": 3.9637756042819384e-07, "loss": 0.0254, "step": 12045 }, { "epoch": 0.9378424407571407, "grad_norm": 0.4234587357989972, "learning_rate": 3.9538927730801815e-07, "loss": 0.029, "step": 12046 }, { "epoch": 0.9379202958493504, "grad_norm": 0.4611444934621677, "learning_rate": 3.944022154629412e-07, "loss": 0.0371, "step": 12047 }, { "epoch": 0.93799815094156, "grad_norm": 0.5051956302031009, "learning_rate": 3.9341637495445394e-07, "loss": 0.0528, "step": 12048 }, { "epoch": 0.9380760060337696, "grad_norm": 0.32377735333536667, "learning_rate": 3.924317558439672e-07, "loss": 0.0202, "step": 12049 }, { "epoch": 0.9381538611259793, "grad_norm": 0.3237754660132156, "learning_rate": 3.9144835819281636e-07, "loss": 0.0251, "step": 12050 }, { "epoch": 0.9381538611259793, "eval_loss": 0.004823906347155571, "eval_runtime": 162.4785, "eval_samples_per_second": 17.725, "eval_steps_per_second": 0.634, "step": 12050 }, { "epoch": 0.9382317162181889, "grad_norm": 0.356282522976407, "learning_rate": 3.904661820622635e-07, "loss": 0.0168, "step": 12051 }, { "epoch": 0.9383095713103985, "grad_norm": 0.38692243623421313, "learning_rate": 3.894852275134953e-07, "loss": 0.0328, "step": 12052 }, { "epoch": 0.9383874264026082, "grad_norm": 0.4281966262083891, "learning_rate": 3.8850549460762056e-07, "loss": 0.0478, "step": 12053 }, { "epoch": 0.9384652814948178, "grad_norm": 0.48523411130529864, "learning_rate": 3.8752698340567276e-07, "loss": 0.0436, "step": 12054 }, { "epoch": 0.9385431365870274, "grad_norm": 0.48182595079987806, "learning_rate": 3.865496939686053e-07, "loss": 0.0393, "step": 12055 }, { "epoch": 0.9386209916792371, "grad_norm": 0.4486496750333468, "learning_rate": 3.855736263573007e-07, "loss": 0.0349, "step": 12056 }, { "epoch": 0.9386988467714467, "grad_norm": 0.5709241927691391, "learning_rate": 3.8459878063256353e-07, "loss": 0.0495, "step": 12057 }, { "epoch": 0.9387767018636562, "grad_norm": 0.4265267407131086, "learning_rate": 3.83625156855123e-07, "loss": 0.031, "step": 12058 }, { "epoch": 0.938854556955866, "grad_norm": 0.4707483865893939, "learning_rate": 3.826527550856285e-07, "loss": 0.0347, "step": 12059 }, { "epoch": 0.9389324120480755, "grad_norm": 0.45235577237592706, "learning_rate": 3.816815753846581e-07, "loss": 0.0504, "step": 12060 }, { "epoch": 0.9390102671402851, "grad_norm": 0.3575550392461377, "learning_rate": 3.8071161781271235e-07, "loss": 0.0272, "step": 12061 }, { "epoch": 0.9390881222324947, "grad_norm": 0.5387283153765584, "learning_rate": 3.7974288243020963e-07, "loss": 0.0407, "step": 12062 }, { "epoch": 0.9391659773247044, "grad_norm": 0.4430028187196177, "learning_rate": 3.78775369297506e-07, "loss": 0.0351, "step": 12063 }, { "epoch": 0.939243832416914, "grad_norm": 0.43855790509353954, "learning_rate": 3.778090784748689e-07, "loss": 0.0262, "step": 12064 }, { "epoch": 0.9393216875091236, "grad_norm": 0.3325876082016258, "learning_rate": 3.768440100224924e-07, "loss": 0.021, "step": 12065 }, { "epoch": 0.9393995426013333, "grad_norm": 0.37767634948650836, "learning_rate": 3.758801640004972e-07, "loss": 0.0267, "step": 12066 }, { "epoch": 0.9394773976935429, "grad_norm": 0.3613722315724098, "learning_rate": 3.749175404689287e-07, "loss": 0.0378, "step": 12067 }, { "epoch": 0.9395552527857525, "grad_norm": 0.5004986618108725, "learning_rate": 3.739561394877478e-07, "loss": 0.0397, "step": 12068 }, { "epoch": 0.9396331078779622, "grad_norm": 0.3725091248125636, "learning_rate": 3.72995961116851e-07, "loss": 0.0261, "step": 12069 }, { "epoch": 0.9397109629701718, "grad_norm": 0.3939121255816057, "learning_rate": 3.720370054160505e-07, "loss": 0.0345, "step": 12070 }, { "epoch": 0.9397888180623813, "grad_norm": 0.42539685808545963, "learning_rate": 3.710792724450851e-07, "loss": 0.0343, "step": 12071 }, { "epoch": 0.939866673154591, "grad_norm": 0.5118824806081235, "learning_rate": 3.701227622636161e-07, "loss": 0.0362, "step": 12072 }, { "epoch": 0.9399445282468006, "grad_norm": 0.36467130517008156, "learning_rate": 3.6916747493123126e-07, "loss": 0.0296, "step": 12073 }, { "epoch": 0.9400223833390102, "grad_norm": 0.4464171970059804, "learning_rate": 3.6821341050743867e-07, "loss": 0.0376, "step": 12074 }, { "epoch": 0.9401002384312199, "grad_norm": 0.4260550734901457, "learning_rate": 3.6726056905167507e-07, "loss": 0.0266, "step": 12075 }, { "epoch": 0.9401780935234295, "grad_norm": 0.437552858443786, "learning_rate": 3.6630895062329306e-07, "loss": 0.0344, "step": 12076 }, { "epoch": 0.9402559486156391, "grad_norm": 0.4625112194716922, "learning_rate": 3.6535855528158084e-07, "loss": 0.0414, "step": 12077 }, { "epoch": 0.9403338037078488, "grad_norm": 0.3731904101102833, "learning_rate": 3.644093830857376e-07, "loss": 0.0247, "step": 12078 }, { "epoch": 0.9404116588000584, "grad_norm": 0.37846891571123753, "learning_rate": 3.6346143409489386e-07, "loss": 0.0325, "step": 12079 }, { "epoch": 0.940489513892268, "grad_norm": 0.43323918249378573, "learning_rate": 3.6251470836810243e-07, "loss": 0.0292, "step": 12080 }, { "epoch": 0.9405673689844777, "grad_norm": 0.46304962559820917, "learning_rate": 3.6156920596434055e-07, "loss": 0.0387, "step": 12081 }, { "epoch": 0.9406452240766873, "grad_norm": 0.383971085758807, "learning_rate": 3.6062492694250773e-07, "loss": 0.0221, "step": 12082 }, { "epoch": 0.9407230791688969, "grad_norm": 0.6183537995662068, "learning_rate": 3.596818713614303e-07, "loss": 0.063, "step": 12083 }, { "epoch": 0.9408009342611066, "grad_norm": 0.42463316124466116, "learning_rate": 3.587400392798546e-07, "loss": 0.0282, "step": 12084 }, { "epoch": 0.9408787893533161, "grad_norm": 0.4587491600320837, "learning_rate": 3.577994307564536e-07, "loss": 0.0322, "step": 12085 }, { "epoch": 0.9409566444455257, "grad_norm": 0.5088475756784148, "learning_rate": 3.5686004584981835e-07, "loss": 0.0455, "step": 12086 }, { "epoch": 0.9410344995377354, "grad_norm": 0.38875618154658603, "learning_rate": 3.559218846184731e-07, "loss": 0.025, "step": 12087 }, { "epoch": 0.941112354629945, "grad_norm": 0.4871033364408073, "learning_rate": 3.5498494712085997e-07, "loss": 0.04, "step": 12088 }, { "epoch": 0.9411902097221546, "grad_norm": 0.6589453913244868, "learning_rate": 3.5404923341534334e-07, "loss": 0.0612, "step": 12089 }, { "epoch": 0.9412680648143643, "grad_norm": 0.6453996599475281, "learning_rate": 3.531147435602167e-07, "loss": 0.0713, "step": 12090 }, { "epoch": 0.9413459199065739, "grad_norm": 0.3974523501753267, "learning_rate": 3.5218147761369335e-07, "loss": 0.0313, "step": 12091 }, { "epoch": 0.9414237749987835, "grad_norm": 0.46569982940954535, "learning_rate": 3.512494356339069e-07, "loss": 0.0495, "step": 12092 }, { "epoch": 0.9415016300909932, "grad_norm": 0.4712984267073502, "learning_rate": 3.503186176789264e-07, "loss": 0.037, "step": 12093 }, { "epoch": 0.9415794851832028, "grad_norm": 0.46593228780682616, "learning_rate": 3.4938902380673656e-07, "loss": 0.0469, "step": 12094 }, { "epoch": 0.9416573402754124, "grad_norm": 0.4806642759095777, "learning_rate": 3.4846065407524223e-07, "loss": 0.038, "step": 12095 }, { "epoch": 0.9417351953676221, "grad_norm": 0.390313577960709, "learning_rate": 3.4753350854227707e-07, "loss": 0.0299, "step": 12096 }, { "epoch": 0.9418130504598317, "grad_norm": 0.3851868489415082, "learning_rate": 3.4660758726560384e-07, "loss": 0.0286, "step": 12097 }, { "epoch": 0.9418909055520412, "grad_norm": 0.3910590592074146, "learning_rate": 3.456828903028941e-07, "loss": 0.02, "step": 12098 }, { "epoch": 0.9419687606442508, "grad_norm": 0.2778875458649593, "learning_rate": 3.447594177117597e-07, "loss": 0.0118, "step": 12099 }, { "epoch": 0.9420466157364605, "grad_norm": 0.4115087803763865, "learning_rate": 3.438371695497256e-07, "loss": 0.0298, "step": 12100 }, { "epoch": 0.9420466157364605, "eval_loss": 0.004810561891645193, "eval_runtime": 163.2581, "eval_samples_per_second": 17.641, "eval_steps_per_second": 0.631, "step": 12100 }, { "epoch": 0.9421244708286701, "grad_norm": 0.507638279814983, "learning_rate": 3.429161458742436e-07, "loss": 0.0401, "step": 12101 }, { "epoch": 0.9422023259208797, "grad_norm": 0.47019963068413123, "learning_rate": 3.4199634674269013e-07, "loss": 0.0337, "step": 12102 }, { "epoch": 0.9422801810130894, "grad_norm": 0.4892699170802774, "learning_rate": 3.4107777221235926e-07, "loss": 0.039, "step": 12103 }, { "epoch": 0.942358036105299, "grad_norm": 0.4300034159128509, "learning_rate": 3.4016042234048087e-07, "loss": 0.0314, "step": 12104 }, { "epoch": 0.9424358911975086, "grad_norm": 0.4023274624312852, "learning_rate": 3.3924429718420027e-07, "loss": 0.0305, "step": 12105 }, { "epoch": 0.9425137462897183, "grad_norm": 0.5783431667391552, "learning_rate": 3.383293968005874e-07, "loss": 0.0554, "step": 12106 }, { "epoch": 0.9425916013819279, "grad_norm": 0.3593798542798782, "learning_rate": 3.3741572124663223e-07, "loss": 0.0235, "step": 12107 }, { "epoch": 0.9426694564741375, "grad_norm": 0.4714428351443683, "learning_rate": 3.365032705792537e-07, "loss": 0.0445, "step": 12108 }, { "epoch": 0.9427473115663472, "grad_norm": 0.5499121897292162, "learning_rate": 3.355920448552974e-07, "loss": 0.059, "step": 12109 }, { "epoch": 0.9428251666585568, "grad_norm": 0.35759778243747514, "learning_rate": 3.346820441315246e-07, "loss": 0.0273, "step": 12110 }, { "epoch": 0.9429030217507663, "grad_norm": 0.4735837824222747, "learning_rate": 3.3377326846462775e-07, "loss": 0.0422, "step": 12111 }, { "epoch": 0.942980876842976, "grad_norm": 0.432639059234336, "learning_rate": 3.3286571791121493e-07, "loss": 0.0316, "step": 12112 }, { "epoch": 0.9430587319351856, "grad_norm": 0.38259967705443537, "learning_rate": 3.3195939252782306e-07, "loss": 0.0243, "step": 12113 }, { "epoch": 0.9431365870273952, "grad_norm": 0.4190976983795458, "learning_rate": 3.310542923709137e-07, "loss": 0.0299, "step": 12114 }, { "epoch": 0.9432144421196049, "grad_norm": 0.43727813588581743, "learning_rate": 3.301504174968728e-07, "loss": 0.028, "step": 12115 }, { "epoch": 0.9432922972118145, "grad_norm": 0.33035134662916227, "learning_rate": 3.2924776796200207e-07, "loss": 0.0203, "step": 12116 }, { "epoch": 0.9433701523040241, "grad_norm": 0.385271661415688, "learning_rate": 3.283463438225343e-07, "loss": 0.0365, "step": 12117 }, { "epoch": 0.9434480073962338, "grad_norm": 0.37558222203720576, "learning_rate": 3.2744614513462226e-07, "loss": 0.03, "step": 12118 }, { "epoch": 0.9435258624884434, "grad_norm": 0.42830739694093434, "learning_rate": 3.2654717195435004e-07, "loss": 0.0337, "step": 12119 }, { "epoch": 0.943603717580653, "grad_norm": 0.3612042127490931, "learning_rate": 3.2564942433771287e-07, "loss": 0.028, "step": 12120 }, { "epoch": 0.9436815726728627, "grad_norm": 0.3811123801212802, "learning_rate": 3.247529023406415e-07, "loss": 0.0239, "step": 12121 }, { "epoch": 0.9437594277650723, "grad_norm": 0.46812076529309876, "learning_rate": 3.238576060189802e-07, "loss": 0.0436, "step": 12122 }, { "epoch": 0.9438372828572819, "grad_norm": 0.4332040860204757, "learning_rate": 3.229635354285043e-07, "loss": 0.0392, "step": 12123 }, { "epoch": 0.9439151379494916, "grad_norm": 0.3130006381676825, "learning_rate": 3.2207069062491157e-07, "loss": 0.0164, "step": 12124 }, { "epoch": 0.9439929930417011, "grad_norm": 0.4423661757531819, "learning_rate": 3.2117907166381965e-07, "loss": 0.0328, "step": 12125 }, { "epoch": 0.9440708481339107, "grad_norm": 0.46724320891483095, "learning_rate": 3.202886786007731e-07, "loss": 0.0448, "step": 12126 }, { "epoch": 0.9441487032261204, "grad_norm": 0.4848569979354443, "learning_rate": 3.193995114912407e-07, "loss": 0.0379, "step": 12127 }, { "epoch": 0.94422655831833, "grad_norm": 0.3464136887173992, "learning_rate": 3.185115703906072e-07, "loss": 0.0244, "step": 12128 }, { "epoch": 0.9443044134105396, "grad_norm": 0.38595250993502095, "learning_rate": 3.1762485535419497e-07, "loss": 0.0263, "step": 12129 }, { "epoch": 0.9443822685027493, "grad_norm": 0.499885540661683, "learning_rate": 3.1673936643723755e-07, "loss": 0.0368, "step": 12130 }, { "epoch": 0.9444601235949589, "grad_norm": 0.3206423019340715, "learning_rate": 3.158551036948998e-07, "loss": 0.0189, "step": 12131 }, { "epoch": 0.9445379786871685, "grad_norm": 0.4300774842902629, "learning_rate": 3.1497206718226423e-07, "loss": 0.0331, "step": 12132 }, { "epoch": 0.9446158337793781, "grad_norm": 0.29932585547897056, "learning_rate": 3.1409025695434027e-07, "loss": 0.0249, "step": 12133 }, { "epoch": 0.9446936888715878, "grad_norm": 0.5056615537467266, "learning_rate": 3.1320967306606387e-07, "loss": 0.0453, "step": 12134 }, { "epoch": 0.9447715439637974, "grad_norm": 0.5253721901235356, "learning_rate": 3.1233031557228677e-07, "loss": 0.0489, "step": 12135 }, { "epoch": 0.944849399056007, "grad_norm": 0.4604801380674099, "learning_rate": 3.114521845277918e-07, "loss": 0.0529, "step": 12136 }, { "epoch": 0.9449272541482167, "grad_norm": 0.4010154484613059, "learning_rate": 3.105752799872797e-07, "loss": 0.0363, "step": 12137 }, { "epoch": 0.9450051092404262, "grad_norm": 0.4968920539792764, "learning_rate": 3.0969960200538217e-07, "loss": 0.0443, "step": 12138 }, { "epoch": 0.9450829643326358, "grad_norm": 0.3573542620844432, "learning_rate": 3.0882515063664243e-07, "loss": 0.0216, "step": 12139 }, { "epoch": 0.9451608194248455, "grad_norm": 0.5116818731071996, "learning_rate": 3.079519259355412e-07, "loss": 0.0555, "step": 12140 }, { "epoch": 0.9452386745170551, "grad_norm": 0.4329532767762204, "learning_rate": 3.0707992795647513e-07, "loss": 0.036, "step": 12141 }, { "epoch": 0.9453165296092647, "grad_norm": 0.426381499148277, "learning_rate": 3.0620915675376503e-07, "loss": 0.0424, "step": 12142 }, { "epoch": 0.9453943847014744, "grad_norm": 0.40560848840199765, "learning_rate": 3.0533961238165433e-07, "loss": 0.0282, "step": 12143 }, { "epoch": 0.945472239793684, "grad_norm": 0.5562590182707552, "learning_rate": 3.044712948943129e-07, "loss": 0.051, "step": 12144 }, { "epoch": 0.9455500948858936, "grad_norm": 0.489279144879118, "learning_rate": 3.0360420434583317e-07, "loss": 0.0467, "step": 12145 }, { "epoch": 0.9456279499781033, "grad_norm": 0.37429483181830464, "learning_rate": 3.027383407902296e-07, "loss": 0.0277, "step": 12146 }, { "epoch": 0.9457058050703129, "grad_norm": 0.48824322130357906, "learning_rate": 3.0187370428144346e-07, "loss": 0.0436, "step": 12147 }, { "epoch": 0.9457836601625225, "grad_norm": 0.46593342671368404, "learning_rate": 3.01010294873334e-07, "loss": 0.0373, "step": 12148 }, { "epoch": 0.9458615152547322, "grad_norm": 0.36592919334253615, "learning_rate": 3.0014811261968923e-07, "loss": 0.0225, "step": 12149 }, { "epoch": 0.9459393703469418, "grad_norm": 0.34299349300654, "learning_rate": 2.992871575742218e-07, "loss": 0.0226, "step": 12150 }, { "epoch": 0.9459393703469418, "eval_loss": 0.004798902198672295, "eval_runtime": 162.2685, "eval_samples_per_second": 17.748, "eval_steps_per_second": 0.635, "step": 12150 }, { "epoch": 0.9460172254391513, "grad_norm": 0.507415504391509, "learning_rate": 2.9842742979056205e-07, "loss": 0.0449, "step": 12151 }, { "epoch": 0.946095080531361, "grad_norm": 0.6074077838028308, "learning_rate": 2.975689293222694e-07, "loss": 0.0516, "step": 12152 }, { "epoch": 0.9461729356235706, "grad_norm": 0.4513470924680499, "learning_rate": 2.9671165622282116e-07, "loss": 0.0292, "step": 12153 }, { "epoch": 0.9462507907157802, "grad_norm": 0.36162268753867943, "learning_rate": 2.958556105456234e-07, "loss": 0.0297, "step": 12154 }, { "epoch": 0.9463286458079899, "grad_norm": 0.3929570041885372, "learning_rate": 2.9500079234400236e-07, "loss": 0.0315, "step": 12155 }, { "epoch": 0.9464065009001995, "grad_norm": 0.47514377906984795, "learning_rate": 2.9414720167121324e-07, "loss": 0.043, "step": 12156 }, { "epoch": 0.9464843559924091, "grad_norm": 0.5476958325862114, "learning_rate": 2.9329483858042464e-07, "loss": 0.0637, "step": 12157 }, { "epoch": 0.9465622110846188, "grad_norm": 0.40643504372741146, "learning_rate": 2.9244370312474067e-07, "loss": 0.0302, "step": 12158 }, { "epoch": 0.9466400661768284, "grad_norm": 0.37228436744390453, "learning_rate": 2.915937953571768e-07, "loss": 0.0339, "step": 12159 }, { "epoch": 0.946717921269038, "grad_norm": 0.48303486746187124, "learning_rate": 2.9074511533068393e-07, "loss": 0.0407, "step": 12160 }, { "epoch": 0.9467957763612477, "grad_norm": 0.5432144755486409, "learning_rate": 2.8989766309812873e-07, "loss": 0.0408, "step": 12161 }, { "epoch": 0.9468736314534573, "grad_norm": 0.4217244258625926, "learning_rate": 2.8905143871230223e-07, "loss": 0.0305, "step": 12162 }, { "epoch": 0.9469514865456669, "grad_norm": 0.7323732839801694, "learning_rate": 2.882064422259245e-07, "loss": 0.0898, "step": 12163 }, { "epoch": 0.9470293416378766, "grad_norm": 0.5035707476797349, "learning_rate": 2.873626736916291e-07, "loss": 0.0396, "step": 12164 }, { "epoch": 0.9471071967300861, "grad_norm": 0.4122307815970207, "learning_rate": 2.8652013316198267e-07, "loss": 0.0312, "step": 12165 }, { "epoch": 0.9471850518222957, "grad_norm": 0.41271982504743526, "learning_rate": 2.8567882068947006e-07, "loss": 0.0307, "step": 12166 }, { "epoch": 0.9472629069145054, "grad_norm": 0.3709620849261069, "learning_rate": 2.848387363265004e-07, "loss": 0.024, "step": 12167 }, { "epoch": 0.947340762006715, "grad_norm": 0.5127668838064902, "learning_rate": 2.8399988012540957e-07, "loss": 0.054, "step": 12168 }, { "epoch": 0.9474186170989246, "grad_norm": 0.5612788992023265, "learning_rate": 2.8316225213845137e-07, "loss": 0.0525, "step": 12169 }, { "epoch": 0.9474964721911342, "grad_norm": 0.4716253459292928, "learning_rate": 2.8232585241780853e-07, "loss": 0.0442, "step": 12170 }, { "epoch": 0.9475743272833439, "grad_norm": 0.29626626868304173, "learning_rate": 2.8149068101558376e-07, "loss": 0.0217, "step": 12171 }, { "epoch": 0.9476521823755535, "grad_norm": 0.3943514832700177, "learning_rate": 2.8065673798380653e-07, "loss": 0.0242, "step": 12172 }, { "epoch": 0.9477300374677631, "grad_norm": 0.451670526769257, "learning_rate": 2.7982402337442203e-07, "loss": 0.0364, "step": 12173 }, { "epoch": 0.9478078925599728, "grad_norm": 0.3976890401049885, "learning_rate": 2.78992537239311e-07, "loss": 0.0174, "step": 12174 }, { "epoch": 0.9478857476521824, "grad_norm": 0.4191742632030095, "learning_rate": 2.78162279630263e-07, "loss": 0.0281, "step": 12175 }, { "epoch": 0.947963602744392, "grad_norm": 0.4640722473943059, "learning_rate": 2.773332505990078e-07, "loss": 0.0426, "step": 12176 }, { "epoch": 0.9480414578366017, "grad_norm": 0.49699028117688354, "learning_rate": 2.7650545019718646e-07, "loss": 0.0497, "step": 12177 }, { "epoch": 0.9481193129288112, "grad_norm": 0.45096078272134804, "learning_rate": 2.756788784763664e-07, "loss": 0.0353, "step": 12178 }, { "epoch": 0.9481971680210208, "grad_norm": 0.4117996221325518, "learning_rate": 2.7485353548803996e-07, "loss": 0.0371, "step": 12179 }, { "epoch": 0.9482750231132305, "grad_norm": 0.45906004135518336, "learning_rate": 2.740294212836192e-07, "loss": 0.0326, "step": 12180 }, { "epoch": 0.9483528782054401, "grad_norm": 0.31797624501697586, "learning_rate": 2.732065359144498e-07, "loss": 0.0196, "step": 12181 }, { "epoch": 0.9484307332976497, "grad_norm": 0.3809148594271249, "learning_rate": 2.7238487943178624e-07, "loss": 0.0262, "step": 12182 }, { "epoch": 0.9485085883898594, "grad_norm": 0.521655626644227, "learning_rate": 2.715644518868166e-07, "loss": 0.0439, "step": 12183 }, { "epoch": 0.948586443482069, "grad_norm": 0.38517871887408495, "learning_rate": 2.7074525333065096e-07, "loss": 0.0328, "step": 12184 }, { "epoch": 0.9486642985742786, "grad_norm": 0.4127394403091173, "learning_rate": 2.6992728381431966e-07, "loss": 0.0274, "step": 12185 }, { "epoch": 0.9487421536664883, "grad_norm": 0.46610513658396613, "learning_rate": 2.691105433887797e-07, "loss": 0.0394, "step": 12186 }, { "epoch": 0.9488200087586979, "grad_norm": 0.43106142565138733, "learning_rate": 2.682950321049083e-07, "loss": 0.0353, "step": 12187 }, { "epoch": 0.9488978638509075, "grad_norm": 0.44051552760333473, "learning_rate": 2.6748075001350904e-07, "loss": 0.0438, "step": 12188 }, { "epoch": 0.9489757189431172, "grad_norm": 0.42868781166757586, "learning_rate": 2.666676971653104e-07, "loss": 0.0395, "step": 12189 }, { "epoch": 0.9490535740353268, "grad_norm": 0.5148798167272337, "learning_rate": 2.6585587361095844e-07, "loss": 0.06, "step": 12190 }, { "epoch": 0.9491314291275363, "grad_norm": 0.6359440837650524, "learning_rate": 2.650452794010261e-07, "loss": 0.0592, "step": 12191 }, { "epoch": 0.949209284219746, "grad_norm": 0.48526090860222487, "learning_rate": 2.6423591458601074e-07, "loss": 0.038, "step": 12192 }, { "epoch": 0.9492871393119556, "grad_norm": 0.39072039167251976, "learning_rate": 2.63427779216332e-07, "loss": 0.027, "step": 12193 }, { "epoch": 0.9493649944041652, "grad_norm": 0.47209610455338946, "learning_rate": 2.626208733423319e-07, "loss": 0.0485, "step": 12194 }, { "epoch": 0.9494428494963749, "grad_norm": 0.41282381166865467, "learning_rate": 2.6181519701427684e-07, "loss": 0.0265, "step": 12195 }, { "epoch": 0.9495207045885845, "grad_norm": 0.4149097711856581, "learning_rate": 2.610107502823578e-07, "loss": 0.0361, "step": 12196 }, { "epoch": 0.9495985596807941, "grad_norm": 0.30465416643354976, "learning_rate": 2.60207533196688e-07, "loss": 0.02, "step": 12197 }, { "epoch": 0.9496764147730038, "grad_norm": 0.5170211734850355, "learning_rate": 2.5940554580730303e-07, "loss": 0.0553, "step": 12198 }, { "epoch": 0.9497542698652134, "grad_norm": 0.4405084373456964, "learning_rate": 2.586047881641651e-07, "loss": 0.0312, "step": 12199 }, { "epoch": 0.949832124957423, "grad_norm": 0.47545008517727494, "learning_rate": 2.5780526031715435e-07, "loss": 0.046, "step": 12200 }, { "epoch": 0.949832124957423, "eval_loss": 0.004786441568285227, "eval_runtime": 162.4034, "eval_samples_per_second": 17.734, "eval_steps_per_second": 0.634, "step": 12200 }, { "epoch": 0.9499099800496327, "grad_norm": 0.4499827735823044, "learning_rate": 2.570069623160798e-07, "loss": 0.0285, "step": 12201 }, { "epoch": 0.9499878351418423, "grad_norm": 0.38094453921127724, "learning_rate": 2.562098942106728e-07, "loss": 0.0285, "step": 12202 }, { "epoch": 0.9500656902340519, "grad_norm": 0.4509511296168289, "learning_rate": 2.554140560505847e-07, "loss": 0.0321, "step": 12203 }, { "epoch": 0.9501435453262614, "grad_norm": 0.5330078916194443, "learning_rate": 2.546194478853914e-07, "loss": 0.0466, "step": 12204 }, { "epoch": 0.9502214004184711, "grad_norm": 0.5381696163400019, "learning_rate": 2.5382606976459554e-07, "loss": 0.0532, "step": 12205 }, { "epoch": 0.9502992555106807, "grad_norm": 0.4233993382328434, "learning_rate": 2.530339217376221e-07, "loss": 0.0409, "step": 12206 }, { "epoch": 0.9503771106028903, "grad_norm": 0.3678515404259467, "learning_rate": 2.5224300385381374e-07, "loss": 0.0268, "step": 12207 }, { "epoch": 0.9504549656951, "grad_norm": 0.46388337433967053, "learning_rate": 2.5145331616244663e-07, "loss": 0.0283, "step": 12208 }, { "epoch": 0.9505328207873096, "grad_norm": 0.4558924362615259, "learning_rate": 2.506648587127103e-07, "loss": 0.0383, "step": 12209 }, { "epoch": 0.9506106758795192, "grad_norm": 0.5014245603206647, "learning_rate": 2.498776315537232e-07, "loss": 0.0487, "step": 12210 }, { "epoch": 0.9506885309717289, "grad_norm": 0.4076693180830894, "learning_rate": 2.49091634734524e-07, "loss": 0.0328, "step": 12211 }, { "epoch": 0.9507663860639385, "grad_norm": 0.3259692223281289, "learning_rate": 2.483068683040801e-07, "loss": 0.0219, "step": 12212 }, { "epoch": 0.9508442411561481, "grad_norm": 0.5395842959195356, "learning_rate": 2.475233323112791e-07, "loss": 0.0631, "step": 12213 }, { "epoch": 0.9509220962483578, "grad_norm": 0.45946757681631506, "learning_rate": 2.467410268049264e-07, "loss": 0.0439, "step": 12214 }, { "epoch": 0.9509999513405674, "grad_norm": 0.49044946337544665, "learning_rate": 2.459599518337608e-07, "loss": 0.0539, "step": 12215 }, { "epoch": 0.951077806432777, "grad_norm": 0.41262679792869156, "learning_rate": 2.451801074464366e-07, "loss": 0.027, "step": 12216 }, { "epoch": 0.9511556615249867, "grad_norm": 0.3905544533661113, "learning_rate": 2.444014936915373e-07, "loss": 0.037, "step": 12217 }, { "epoch": 0.9512335166171962, "grad_norm": 0.3521113938746251, "learning_rate": 2.436241106175663e-07, "loss": 0.0231, "step": 12218 }, { "epoch": 0.9513113717094058, "grad_norm": 0.4220367481794253, "learning_rate": 2.4284795827294926e-07, "loss": 0.0435, "step": 12219 }, { "epoch": 0.9513892268016155, "grad_norm": 0.43957066657552557, "learning_rate": 2.420730367060364e-07, "loss": 0.0357, "step": 12220 }, { "epoch": 0.9514670818938251, "grad_norm": 0.40938838437079234, "learning_rate": 2.412993459651047e-07, "loss": 0.0349, "step": 12221 }, { "epoch": 0.9515449369860347, "grad_norm": 0.4024390470381663, "learning_rate": 2.4052688609834897e-07, "loss": 0.0301, "step": 12222 }, { "epoch": 0.9516227920782444, "grad_norm": 0.4204150703922025, "learning_rate": 2.3975565715388836e-07, "loss": 0.031, "step": 12223 }, { "epoch": 0.951700647170454, "grad_norm": 0.37803021065765774, "learning_rate": 2.3898565917977123e-07, "loss": 0.0286, "step": 12224 }, { "epoch": 0.9517785022626636, "grad_norm": 0.5121031496390052, "learning_rate": 2.382168922239636e-07, "loss": 0.0481, "step": 12225 }, { "epoch": 0.9518563573548733, "grad_norm": 0.3242572911400561, "learning_rate": 2.374493563343516e-07, "loss": 0.0236, "step": 12226 }, { "epoch": 0.9519342124470829, "grad_norm": 0.2669265359966931, "learning_rate": 2.3668305155875482e-07, "loss": 0.0136, "step": 12227 }, { "epoch": 0.9520120675392925, "grad_norm": 0.40200910398426776, "learning_rate": 2.3591797794490835e-07, "loss": 0.0353, "step": 12228 }, { "epoch": 0.9520899226315022, "grad_norm": 0.4302983885307902, "learning_rate": 2.3515413554047185e-07, "loss": 0.0336, "step": 12229 }, { "epoch": 0.9521677777237118, "grad_norm": 0.35907905963861925, "learning_rate": 2.343915243930317e-07, "loss": 0.0283, "step": 12230 }, { "epoch": 0.9522456328159213, "grad_norm": 0.5472133446885319, "learning_rate": 2.3363014455008993e-07, "loss": 0.0498, "step": 12231 }, { "epoch": 0.952323487908131, "grad_norm": 0.546793472915791, "learning_rate": 2.3286999605907968e-07, "loss": 0.0574, "step": 12232 }, { "epoch": 0.9524013430003406, "grad_norm": 0.3893613762284801, "learning_rate": 2.321110789673564e-07, "loss": 0.0203, "step": 12233 }, { "epoch": 0.9524791980925502, "grad_norm": 0.41149303725895975, "learning_rate": 2.3135339332219565e-07, "loss": 0.0295, "step": 12234 }, { "epoch": 0.9525570531847599, "grad_norm": 0.31675759965718636, "learning_rate": 2.3059693917079962e-07, "loss": 0.019, "step": 12235 }, { "epoch": 0.9526349082769695, "grad_norm": 0.5725653523580482, "learning_rate": 2.298417165602862e-07, "loss": 0.0733, "step": 12236 }, { "epoch": 0.9527127633691791, "grad_norm": 0.3523280611928789, "learning_rate": 2.2908772553770663e-07, "loss": 0.0272, "step": 12237 }, { "epoch": 0.9527906184613888, "grad_norm": 0.3559603287827199, "learning_rate": 2.2833496615003227e-07, "loss": 0.0251, "step": 12238 }, { "epoch": 0.9528684735535984, "grad_norm": 0.3806482397913208, "learning_rate": 2.275834384441544e-07, "loss": 0.0251, "step": 12239 }, { "epoch": 0.952946328645808, "grad_norm": 0.3677179906419074, "learning_rate": 2.2683314246688902e-07, "loss": 0.032, "step": 12240 }, { "epoch": 0.9530241837380176, "grad_norm": 0.5092906811677729, "learning_rate": 2.2608407826497646e-07, "loss": 0.0414, "step": 12241 }, { "epoch": 0.9531020388302273, "grad_norm": 0.35031299283350364, "learning_rate": 2.253362458850794e-07, "loss": 0.0219, "step": 12242 }, { "epoch": 0.9531798939224368, "grad_norm": 0.4223784423314686, "learning_rate": 2.2458964537378726e-07, "loss": 0.0348, "step": 12243 }, { "epoch": 0.9532577490146464, "grad_norm": 0.5468058414653321, "learning_rate": 2.2384427677760502e-07, "loss": 0.0624, "step": 12244 }, { "epoch": 0.9533356041068561, "grad_norm": 0.4250421937200326, "learning_rate": 2.2310014014296888e-07, "loss": 0.0286, "step": 12245 }, { "epoch": 0.9534134591990657, "grad_norm": 0.31294864617167184, "learning_rate": 2.2235723551623512e-07, "loss": 0.0196, "step": 12246 }, { "epoch": 0.9534913142912753, "grad_norm": 0.4422131745549845, "learning_rate": 2.2161556294368226e-07, "loss": 0.0314, "step": 12247 }, { "epoch": 0.953569169383485, "grad_norm": 0.38179117509349564, "learning_rate": 2.2087512247151554e-07, "loss": 0.0249, "step": 12248 }, { "epoch": 0.9536470244756946, "grad_norm": 0.3905508263701058, "learning_rate": 2.2013591414585588e-07, "loss": 0.0291, "step": 12249 }, { "epoch": 0.9537248795679042, "grad_norm": 0.44700965131924425, "learning_rate": 2.1939793801275534e-07, "loss": 0.0469, "step": 12250 }, { "epoch": 0.9537248795679042, "eval_loss": 0.004776487592607737, "eval_runtime": 162.4113, "eval_samples_per_second": 17.733, "eval_steps_per_second": 0.634, "step": 12250 }, { "epoch": 0.9538027346601139, "grad_norm": 0.40321105426649173, "learning_rate": 2.18661194118186e-07, "loss": 0.0223, "step": 12251 }, { "epoch": 0.9538805897523235, "grad_norm": 0.3266265781220968, "learning_rate": 2.1792568250804446e-07, "loss": 0.0257, "step": 12252 }, { "epoch": 0.9539584448445331, "grad_norm": 0.35687811633777294, "learning_rate": 2.1719140322814746e-07, "loss": 0.0247, "step": 12253 }, { "epoch": 0.9540362999367428, "grad_norm": 0.4423057995116255, "learning_rate": 2.1645835632424062e-07, "loss": 0.0479, "step": 12254 }, { "epoch": 0.9541141550289524, "grad_norm": 0.4383791782597049, "learning_rate": 2.1572654184198737e-07, "loss": 0.042, "step": 12255 }, { "epoch": 0.954192010121162, "grad_norm": 0.44458750015337656, "learning_rate": 2.1499595982697575e-07, "loss": 0.0319, "step": 12256 }, { "epoch": 0.9542698652133716, "grad_norm": 0.4794628522486378, "learning_rate": 2.142666103247182e-07, "loss": 0.0411, "step": 12257 }, { "epoch": 0.9543477203055812, "grad_norm": 0.386819896410446, "learning_rate": 2.1353849338064724e-07, "loss": 0.0312, "step": 12258 }, { "epoch": 0.9544255753977908, "grad_norm": 0.4016084376280712, "learning_rate": 2.1281160904012665e-07, "loss": 0.0334, "step": 12259 }, { "epoch": 0.9545034304900005, "grad_norm": 0.48952299595293797, "learning_rate": 2.120859573484335e-07, "loss": 0.0404, "step": 12260 }, { "epoch": 0.9545812855822101, "grad_norm": 0.3691584255599223, "learning_rate": 2.113615383507739e-07, "loss": 0.0273, "step": 12261 }, { "epoch": 0.9546591406744197, "grad_norm": 0.35002501720154133, "learning_rate": 2.1063835209227613e-07, "loss": 0.0238, "step": 12262 }, { "epoch": 0.9547369957666294, "grad_norm": 0.39055686275470847, "learning_rate": 2.0991639861799084e-07, "loss": 0.0256, "step": 12263 }, { "epoch": 0.954814850858839, "grad_norm": 0.3809522870498133, "learning_rate": 2.09195677972891e-07, "loss": 0.03, "step": 12264 }, { "epoch": 0.9548927059510486, "grad_norm": 0.48719726563637905, "learning_rate": 2.0847619020187616e-07, "loss": 0.0392, "step": 12265 }, { "epoch": 0.9549705610432583, "grad_norm": 0.4178060507509501, "learning_rate": 2.0775793534976828e-07, "loss": 0.0277, "step": 12266 }, { "epoch": 0.9550484161354679, "grad_norm": 0.3374015178354918, "learning_rate": 2.0704091346130718e-07, "loss": 0.0337, "step": 12267 }, { "epoch": 0.9551262712276775, "grad_norm": 0.4359536245223932, "learning_rate": 2.063251245811615e-07, "loss": 0.0313, "step": 12268 }, { "epoch": 0.9552041263198872, "grad_norm": 0.42704656459557955, "learning_rate": 2.056105687539245e-07, "loss": 0.0418, "step": 12269 }, { "epoch": 0.9552819814120967, "grad_norm": 0.44005247002974585, "learning_rate": 2.04897246024105e-07, "loss": 0.0391, "step": 12270 }, { "epoch": 0.9553598365043063, "grad_norm": 0.3703229463074025, "learning_rate": 2.0418515643614302e-07, "loss": 0.0194, "step": 12271 }, { "epoch": 0.955437691596516, "grad_norm": 0.4142488466744628, "learning_rate": 2.0347430003439638e-07, "loss": 0.0358, "step": 12272 }, { "epoch": 0.9555155466887256, "grad_norm": 0.3593143040012558, "learning_rate": 2.027646768631475e-07, "loss": 0.0267, "step": 12273 }, { "epoch": 0.9555934017809352, "grad_norm": 0.4152618610848531, "learning_rate": 2.020562869666076e-07, "loss": 0.0308, "step": 12274 }, { "epoch": 0.9556712568731449, "grad_norm": 0.53135724944593, "learning_rate": 2.0134913038889925e-07, "loss": 0.0576, "step": 12275 }, { "epoch": 0.9557491119653545, "grad_norm": 0.3970239780950325, "learning_rate": 2.0064320717408048e-07, "loss": 0.0348, "step": 12276 }, { "epoch": 0.9558269670575641, "grad_norm": 0.34242496474836714, "learning_rate": 1.99938517366125e-07, "loss": 0.0191, "step": 12277 }, { "epoch": 0.9559048221497737, "grad_norm": 0.4634041123913961, "learning_rate": 1.9923506100892887e-07, "loss": 0.0501, "step": 12278 }, { "epoch": 0.9559826772419834, "grad_norm": 0.3910761717018554, "learning_rate": 1.985328381463192e-07, "loss": 0.0331, "step": 12279 }, { "epoch": 0.956060532334193, "grad_norm": 0.5477900111899442, "learning_rate": 1.9783184882203655e-07, "loss": 0.061, "step": 12280 }, { "epoch": 0.9561383874264026, "grad_norm": 0.47737783692354513, "learning_rate": 1.9713209307975267e-07, "loss": 0.034, "step": 12281 }, { "epoch": 0.9562162425186123, "grad_norm": 0.3474928308410592, "learning_rate": 1.9643357096305716e-07, "loss": 0.0228, "step": 12282 }, { "epoch": 0.9562940976108218, "grad_norm": 0.5190839862831085, "learning_rate": 1.9573628251546406e-07, "loss": 0.04, "step": 12283 }, { "epoch": 0.9563719527030314, "grad_norm": 0.44337748485713574, "learning_rate": 1.950402277804142e-07, "loss": 0.0297, "step": 12284 }, { "epoch": 0.9564498077952411, "grad_norm": 0.30438687755025456, "learning_rate": 1.9434540680126622e-07, "loss": 0.0168, "step": 12285 }, { "epoch": 0.9565276628874507, "grad_norm": 0.3143285483555362, "learning_rate": 1.9365181962130552e-07, "loss": 0.0217, "step": 12286 }, { "epoch": 0.9566055179796603, "grad_norm": 0.41410543504738484, "learning_rate": 1.929594662837375e-07, "loss": 0.0343, "step": 12287 }, { "epoch": 0.95668337307187, "grad_norm": 0.4519045812159035, "learning_rate": 1.9226834683169214e-07, "loss": 0.0484, "step": 12288 }, { "epoch": 0.9567612281640796, "grad_norm": 0.4768451310918281, "learning_rate": 1.915784613082261e-07, "loss": 0.0381, "step": 12289 }, { "epoch": 0.9568390832562892, "grad_norm": 0.5170186521363266, "learning_rate": 1.9088980975631165e-07, "loss": 0.0407, "step": 12290 }, { "epoch": 0.9569169383484989, "grad_norm": 0.38976296675808947, "learning_rate": 1.902023922188545e-07, "loss": 0.0288, "step": 12291 }, { "epoch": 0.9569947934407085, "grad_norm": 0.416288598607724, "learning_rate": 1.895162087386715e-07, "loss": 0.0399, "step": 12292 }, { "epoch": 0.9570726485329181, "grad_norm": 0.4668044266047884, "learning_rate": 1.8883125935851066e-07, "loss": 0.0401, "step": 12293 }, { "epoch": 0.9571505036251278, "grad_norm": 0.4493120223259934, "learning_rate": 1.881475441210423e-07, "loss": 0.0383, "step": 12294 }, { "epoch": 0.9572283587173374, "grad_norm": 0.3682992600924978, "learning_rate": 1.8746506306885902e-07, "loss": 0.0241, "step": 12295 }, { "epoch": 0.957306213809547, "grad_norm": 0.46024218337653733, "learning_rate": 1.867838162444757e-07, "loss": 0.0383, "step": 12296 }, { "epoch": 0.9573840689017566, "grad_norm": 0.30557319888445117, "learning_rate": 1.8610380369032953e-07, "loss": 0.0193, "step": 12297 }, { "epoch": 0.9574619239939662, "grad_norm": 0.39130083139373417, "learning_rate": 1.8542502544878216e-07, "loss": 0.0384, "step": 12298 }, { "epoch": 0.9575397790861758, "grad_norm": 0.5191964177504589, "learning_rate": 1.8474748156211973e-07, "loss": 0.0509, "step": 12299 }, { "epoch": 0.9576176341783855, "grad_norm": 0.35066354062255245, "learning_rate": 1.8407117207254855e-07, "loss": 0.0271, "step": 12300 }, { "epoch": 0.9576176341783855, "eval_loss": 0.004768344573676586, "eval_runtime": 163.3357, "eval_samples_per_second": 17.632, "eval_steps_per_second": 0.631, "step": 12300 }, { "epoch": 0.9576954892705951, "grad_norm": 0.3411453492645743, "learning_rate": 1.8339609702220152e-07, "loss": 0.0264, "step": 12301 }, { "epoch": 0.9577733443628047, "grad_norm": 0.43883528958389817, "learning_rate": 1.8272225645312948e-07, "loss": 0.041, "step": 12302 }, { "epoch": 0.9578511994550144, "grad_norm": 0.42512202988411707, "learning_rate": 1.8204965040731214e-07, "loss": 0.0321, "step": 12303 }, { "epoch": 0.957929054547224, "grad_norm": 0.41745869513441064, "learning_rate": 1.8137827892664939e-07, "loss": 0.0343, "step": 12304 }, { "epoch": 0.9580069096394336, "grad_norm": 0.44798051388261007, "learning_rate": 1.8070814205296327e-07, "loss": 0.0464, "step": 12305 }, { "epoch": 0.9580847647316433, "grad_norm": 0.5605889830857662, "learning_rate": 1.8003923982800264e-07, "loss": 0.0584, "step": 12306 }, { "epoch": 0.9581626198238529, "grad_norm": 0.4910439689822377, "learning_rate": 1.7937157229343417e-07, "loss": 0.0431, "step": 12307 }, { "epoch": 0.9582404749160625, "grad_norm": 0.3195510238634168, "learning_rate": 1.787051394908512e-07, "loss": 0.0183, "step": 12308 }, { "epoch": 0.9583183300082722, "grad_norm": 0.5064251788862679, "learning_rate": 1.7803994146177173e-07, "loss": 0.0547, "step": 12309 }, { "epoch": 0.9583961851004817, "grad_norm": 0.38893950968517366, "learning_rate": 1.7737597824763142e-07, "loss": 0.0266, "step": 12310 }, { "epoch": 0.9584740401926913, "grad_norm": 0.30279700478546917, "learning_rate": 1.7671324988979276e-07, "loss": 0.0157, "step": 12311 }, { "epoch": 0.9585518952849009, "grad_norm": 0.40286579443761183, "learning_rate": 1.7605175642954053e-07, "loss": 0.0306, "step": 12312 }, { "epoch": 0.9586297503771106, "grad_norm": 0.4166792573857162, "learning_rate": 1.7539149790808175e-07, "loss": 0.0393, "step": 12313 }, { "epoch": 0.9587076054693202, "grad_norm": 0.3975310887612088, "learning_rate": 1.7473247436655018e-07, "loss": 0.0316, "step": 12314 }, { "epoch": 0.9587854605615298, "grad_norm": 0.3587598575380289, "learning_rate": 1.7407468584600184e-07, "loss": 0.021, "step": 12315 }, { "epoch": 0.9588633156537395, "grad_norm": 0.5194402174088901, "learning_rate": 1.7341813238740624e-07, "loss": 0.0485, "step": 12316 }, { "epoch": 0.9589411707459491, "grad_norm": 0.42600508080404687, "learning_rate": 1.727628140316706e-07, "loss": 0.0336, "step": 12317 }, { "epoch": 0.9590190258381587, "grad_norm": 0.4779533305185685, "learning_rate": 1.721087308196112e-07, "loss": 0.039, "step": 12318 }, { "epoch": 0.9590968809303684, "grad_norm": 0.5783373405271578, "learning_rate": 1.714558827919821e-07, "loss": 0.0504, "step": 12319 }, { "epoch": 0.959174736022578, "grad_norm": 0.3996960142794391, "learning_rate": 1.7080426998944854e-07, "loss": 0.0347, "step": 12320 }, { "epoch": 0.9592525911147876, "grad_norm": 0.44639038775255296, "learning_rate": 1.701538924526025e-07, "loss": 0.0295, "step": 12321 }, { "epoch": 0.9593304462069973, "grad_norm": 0.3941906402083973, "learning_rate": 1.6950475022196266e-07, "loss": 0.0274, "step": 12322 }, { "epoch": 0.9594083012992068, "grad_norm": 0.4831136529951224, "learning_rate": 1.6885684333796114e-07, "loss": 0.0514, "step": 12323 }, { "epoch": 0.9594861563914164, "grad_norm": 0.4728846550809182, "learning_rate": 1.6821017184096788e-07, "loss": 0.0426, "step": 12324 }, { "epoch": 0.9595640114836261, "grad_norm": 0.6032399669215379, "learning_rate": 1.6756473577126174e-07, "loss": 0.0605, "step": 12325 }, { "epoch": 0.9596418665758357, "grad_norm": 0.4497673965646203, "learning_rate": 1.6692053516905282e-07, "loss": 0.0307, "step": 12326 }, { "epoch": 0.9597197216680453, "grad_norm": 0.49558138073081964, "learning_rate": 1.66277570074469e-07, "loss": 0.0502, "step": 12327 }, { "epoch": 0.959797576760255, "grad_norm": 0.5292578212412208, "learning_rate": 1.656358405275671e-07, "loss": 0.0546, "step": 12328 }, { "epoch": 0.9598754318524646, "grad_norm": 0.28537386668198367, "learning_rate": 1.6499534656832183e-07, "loss": 0.0121, "step": 12329 }, { "epoch": 0.9599532869446742, "grad_norm": 0.3874967864349845, "learning_rate": 1.6435608823663464e-07, "loss": 0.0252, "step": 12330 }, { "epoch": 0.9600311420368839, "grad_norm": 0.6172594300604815, "learning_rate": 1.6371806557232694e-07, "loss": 0.0651, "step": 12331 }, { "epoch": 0.9601089971290935, "grad_norm": 0.4408377404527462, "learning_rate": 1.6308127861514477e-07, "loss": 0.032, "step": 12332 }, { "epoch": 0.9601868522213031, "grad_norm": 0.37936691068202777, "learning_rate": 1.624457274047564e-07, "loss": 0.0255, "step": 12333 }, { "epoch": 0.9602647073135128, "grad_norm": 0.42897011681612635, "learning_rate": 1.618114119807568e-07, "loss": 0.0337, "step": 12334 }, { "epoch": 0.9603425624057224, "grad_norm": 0.36718918683049123, "learning_rate": 1.6117833238265656e-07, "loss": 0.0292, "step": 12335 }, { "epoch": 0.9604204174979319, "grad_norm": 0.3111414358986098, "learning_rate": 1.605464886498953e-07, "loss": 0.0146, "step": 12336 }, { "epoch": 0.9604982725901416, "grad_norm": 0.4625109368517957, "learning_rate": 1.5991588082183486e-07, "loss": 0.038, "step": 12337 }, { "epoch": 0.9605761276823512, "grad_norm": 0.5931225978934673, "learning_rate": 1.592865089377593e-07, "loss": 0.05, "step": 12338 }, { "epoch": 0.9606539827745608, "grad_norm": 0.37537913286476127, "learning_rate": 1.5865837303687293e-07, "loss": 0.0298, "step": 12339 }, { "epoch": 0.9607318378667705, "grad_norm": 0.4289879439441547, "learning_rate": 1.580314731583088e-07, "loss": 0.0384, "step": 12340 }, { "epoch": 0.9608096929589801, "grad_norm": 0.46919501432030264, "learning_rate": 1.5740580934111792e-07, "loss": 0.044, "step": 12341 }, { "epoch": 0.9608875480511897, "grad_norm": 0.47174980180591414, "learning_rate": 1.5678138162427581e-07, "loss": 0.0278, "step": 12342 }, { "epoch": 0.9609654031433994, "grad_norm": 0.3584761784498857, "learning_rate": 1.5615819004668463e-07, "loss": 0.0165, "step": 12343 }, { "epoch": 0.961043258235609, "grad_norm": 0.34935674034438563, "learning_rate": 1.555362346471623e-07, "loss": 0.0256, "step": 12344 }, { "epoch": 0.9611211133278186, "grad_norm": 0.498982797013983, "learning_rate": 1.5491551546445548e-07, "loss": 0.0447, "step": 12345 }, { "epoch": 0.9611989684200283, "grad_norm": 0.5048883367728156, "learning_rate": 1.5429603253723113e-07, "loss": 0.0533, "step": 12346 }, { "epoch": 0.9612768235122379, "grad_norm": 0.35140532591470425, "learning_rate": 1.5367778590408278e-07, "loss": 0.0228, "step": 12347 }, { "epoch": 0.9613546786044475, "grad_norm": 0.4739279849102935, "learning_rate": 1.530607756035196e-07, "loss": 0.0449, "step": 12348 }, { "epoch": 0.961432533696657, "grad_norm": 0.358808795790528, "learning_rate": 1.5244500167398201e-07, "loss": 0.0227, "step": 12349 }, { "epoch": 0.9615103887888667, "grad_norm": 0.552094893972733, "learning_rate": 1.5183046415383041e-07, "loss": 0.0461, "step": 12350 }, { "epoch": 0.9615103887888667, "eval_loss": 0.004762422293424606, "eval_runtime": 162.1326, "eval_samples_per_second": 17.763, "eval_steps_per_second": 0.635, "step": 12350 }, { "epoch": 0.9615882438810763, "grad_norm": 0.4261840304219095, "learning_rate": 1.5121716308134526e-07, "loss": 0.0318, "step": 12351 }, { "epoch": 0.9616660989732859, "grad_norm": 0.4153827024620704, "learning_rate": 1.5060509849473382e-07, "loss": 0.0276, "step": 12352 }, { "epoch": 0.9617439540654956, "grad_norm": 0.46735048993438566, "learning_rate": 1.4999427043212554e-07, "loss": 0.0355, "step": 12353 }, { "epoch": 0.9618218091577052, "grad_norm": 0.48878966002514856, "learning_rate": 1.4938467893157005e-07, "loss": 0.0425, "step": 12354 }, { "epoch": 0.9618996642499148, "grad_norm": 0.35275384307124796, "learning_rate": 1.4877632403104137e-07, "loss": 0.0201, "step": 12355 }, { "epoch": 0.9619775193421245, "grad_norm": 0.5925924219116783, "learning_rate": 1.481692057684403e-07, "loss": 0.049, "step": 12356 }, { "epoch": 0.9620553744343341, "grad_norm": 0.49530045863076894, "learning_rate": 1.475633241815855e-07, "loss": 0.0462, "step": 12357 }, { "epoch": 0.9621332295265437, "grad_norm": 0.5281382723215328, "learning_rate": 1.4695867930822228e-07, "loss": 0.0426, "step": 12358 }, { "epoch": 0.9622110846187534, "grad_norm": 0.4729564205897245, "learning_rate": 1.4635527118601388e-07, "loss": 0.0379, "step": 12359 }, { "epoch": 0.962288939710963, "grad_norm": 0.4062411832229911, "learning_rate": 1.457530998525525e-07, "loss": 0.0278, "step": 12360 }, { "epoch": 0.9623667948031726, "grad_norm": 0.4439018064540844, "learning_rate": 1.4515216534534803e-07, "loss": 0.0453, "step": 12361 }, { "epoch": 0.9624446498953823, "grad_norm": 0.33382784883230937, "learning_rate": 1.445524677018395e-07, "loss": 0.0239, "step": 12362 }, { "epoch": 0.9625225049875918, "grad_norm": 0.42184722074622344, "learning_rate": 1.4395400695938365e-07, "loss": 0.038, "step": 12363 }, { "epoch": 0.9626003600798014, "grad_norm": 0.37451438475234866, "learning_rate": 1.433567831552618e-07, "loss": 0.027, "step": 12364 }, { "epoch": 0.9626782151720111, "grad_norm": 0.4325718216661705, "learning_rate": 1.427607963266775e-07, "loss": 0.0322, "step": 12365 }, { "epoch": 0.9627560702642207, "grad_norm": 0.39261431699705074, "learning_rate": 1.4216604651075662e-07, "loss": 0.0323, "step": 12366 }, { "epoch": 0.9628339253564303, "grad_norm": 0.44387919807490983, "learning_rate": 1.4157253374455392e-07, "loss": 0.0325, "step": 12367 }, { "epoch": 0.96291178044864, "grad_norm": 0.34964324092440363, "learning_rate": 1.4098025806503767e-07, "loss": 0.0306, "step": 12368 }, { "epoch": 0.9629896355408496, "grad_norm": 0.30511931133299447, "learning_rate": 1.40389219509105e-07, "loss": 0.0218, "step": 12369 }, { "epoch": 0.9630674906330592, "grad_norm": 0.5167008178310278, "learning_rate": 1.3979941811357534e-07, "loss": 0.057, "step": 12370 }, { "epoch": 0.9631453457252689, "grad_norm": 0.5010604517541816, "learning_rate": 1.3921085391519261e-07, "loss": 0.0414, "step": 12371 }, { "epoch": 0.9632232008174785, "grad_norm": 0.45045319362872377, "learning_rate": 1.3862352695061866e-07, "loss": 0.0335, "step": 12372 }, { "epoch": 0.9633010559096881, "grad_norm": 0.5177695319367592, "learning_rate": 1.3803743725644415e-07, "loss": 0.0536, "step": 12373 }, { "epoch": 0.9633789110018978, "grad_norm": 0.3497932367903614, "learning_rate": 1.3745258486917545e-07, "loss": 0.0273, "step": 12374 }, { "epoch": 0.9634567660941074, "grad_norm": 0.5868885565684431, "learning_rate": 1.368689698252479e-07, "loss": 0.0676, "step": 12375 }, { "epoch": 0.9635346211863169, "grad_norm": 0.5589253976073736, "learning_rate": 1.3628659216102124e-07, "loss": 0.0399, "step": 12376 }, { "epoch": 0.9636124762785266, "grad_norm": 0.4591610045863323, "learning_rate": 1.3570545191277097e-07, "loss": 0.0365, "step": 12377 }, { "epoch": 0.9636903313707362, "grad_norm": 0.511282846507794, "learning_rate": 1.351255491167014e-07, "loss": 0.0449, "step": 12378 }, { "epoch": 0.9637681864629458, "grad_norm": 0.5840013554035227, "learning_rate": 1.3454688380893477e-07, "loss": 0.0662, "step": 12379 }, { "epoch": 0.9638460415551555, "grad_norm": 0.47425831184398154, "learning_rate": 1.3396945602552225e-07, "loss": 0.0434, "step": 12380 }, { "epoch": 0.9639238966473651, "grad_norm": 0.43106362074091104, "learning_rate": 1.3339326580243506e-07, "loss": 0.0389, "step": 12381 }, { "epoch": 0.9640017517395747, "grad_norm": 0.4671867780296298, "learning_rate": 1.328183131755667e-07, "loss": 0.0435, "step": 12382 }, { "epoch": 0.9640796068317843, "grad_norm": 0.4513039610499869, "learning_rate": 1.3224459818073076e-07, "loss": 0.031, "step": 12383 }, { "epoch": 0.964157461923994, "grad_norm": 0.44503189733283666, "learning_rate": 1.31672120853672e-07, "loss": 0.0414, "step": 12384 }, { "epoch": 0.9642353170162036, "grad_norm": 0.5112444140273436, "learning_rate": 1.311008812300485e-07, "loss": 0.0502, "step": 12385 }, { "epoch": 0.9643131721084132, "grad_norm": 0.44170110999377155, "learning_rate": 1.305308793454496e-07, "loss": 0.0452, "step": 12386 }, { "epoch": 0.9643910272006229, "grad_norm": 0.4853762073763175, "learning_rate": 1.299621152353825e-07, "loss": 0.0391, "step": 12387 }, { "epoch": 0.9644688822928325, "grad_norm": 0.4409857713812177, "learning_rate": 1.293945889352788e-07, "loss": 0.0366, "step": 12388 }, { "epoch": 0.964546737385042, "grad_norm": 0.35687440730738035, "learning_rate": 1.288283004804902e-07, "loss": 0.0271, "step": 12389 }, { "epoch": 0.9646245924772517, "grad_norm": 0.42127125477657495, "learning_rate": 1.2826324990629523e-07, "loss": 0.0308, "step": 12390 }, { "epoch": 0.9647024475694613, "grad_norm": 0.40992188541756475, "learning_rate": 1.276994372478968e-07, "loss": 0.0374, "step": 12391 }, { "epoch": 0.9647803026616709, "grad_norm": 0.4334367220258172, "learning_rate": 1.271368625404157e-07, "loss": 0.038, "step": 12392 }, { "epoch": 0.9648581577538806, "grad_norm": 0.3713485522890462, "learning_rate": 1.26575525818895e-07, "loss": 0.0249, "step": 12393 }, { "epoch": 0.9649360128460902, "grad_norm": 0.46394507462768964, "learning_rate": 1.2601542711830673e-07, "loss": 0.0307, "step": 12394 }, { "epoch": 0.9650138679382998, "grad_norm": 0.4213067079791304, "learning_rate": 1.2545656647354075e-07, "loss": 0.0331, "step": 12395 }, { "epoch": 0.9650917230305095, "grad_norm": 0.39442747246406823, "learning_rate": 1.2489894391941148e-07, "loss": 0.0253, "step": 12396 }, { "epoch": 0.9651695781227191, "grad_norm": 0.5128909760296658, "learning_rate": 1.2434255949065776e-07, "loss": 0.0557, "step": 12397 }, { "epoch": 0.9652474332149287, "grad_norm": 0.5162757708579809, "learning_rate": 1.2378741322194078e-07, "loss": 0.0469, "step": 12398 }, { "epoch": 0.9653252883071384, "grad_norm": 0.5189631510129902, "learning_rate": 1.232335051478395e-07, "loss": 0.0458, "step": 12399 }, { "epoch": 0.965403143399348, "grad_norm": 0.47187890929583914, "learning_rate": 1.226808353028619e-07, "loss": 0.0466, "step": 12400 }, { "epoch": 0.965403143399348, "eval_loss": 0.004754895810037851, "eval_runtime": 162.6456, "eval_samples_per_second": 17.707, "eval_steps_per_second": 0.633, "step": 12400 }, { "epoch": 0.9654809984915576, "grad_norm": 0.6290172595386122, "learning_rate": 1.2212940372143601e-07, "loss": 0.0662, "step": 12401 }, { "epoch": 0.9655588535837673, "grad_norm": 0.49241441829790367, "learning_rate": 1.2157921043791431e-07, "loss": 0.0552, "step": 12402 }, { "epoch": 0.9656367086759768, "grad_norm": 0.476504744080545, "learning_rate": 1.2103025548657166e-07, "loss": 0.0353, "step": 12403 }, { "epoch": 0.9657145637681864, "grad_norm": 0.4802124126998029, "learning_rate": 1.204825389016051e-07, "loss": 0.0356, "step": 12404 }, { "epoch": 0.9657924188603961, "grad_norm": 0.405479990103469, "learning_rate": 1.1993606071713182e-07, "loss": 0.0347, "step": 12405 }, { "epoch": 0.9658702739526057, "grad_norm": 0.615698620209449, "learning_rate": 1.193908209672001e-07, "loss": 0.0573, "step": 12406 }, { "epoch": 0.9659481290448153, "grad_norm": 0.44226202861706415, "learning_rate": 1.1884681968577172e-07, "loss": 0.043, "step": 12407 }, { "epoch": 0.966025984137025, "grad_norm": 0.4696742135429935, "learning_rate": 1.183040569067373e-07, "loss": 0.0439, "step": 12408 }, { "epoch": 0.9661038392292346, "grad_norm": 0.483360969730752, "learning_rate": 1.1776253266390758e-07, "loss": 0.0495, "step": 12409 }, { "epoch": 0.9661816943214442, "grad_norm": 0.46417563475266993, "learning_rate": 1.1722224699101781e-07, "loss": 0.0434, "step": 12410 }, { "epoch": 0.9662595494136539, "grad_norm": 0.4898897264473228, "learning_rate": 1.1668319992172327e-07, "loss": 0.0464, "step": 12411 }, { "epoch": 0.9663374045058635, "grad_norm": 0.38660767793247386, "learning_rate": 1.1614539148960825e-07, "loss": 0.0259, "step": 12412 }, { "epoch": 0.9664152595980731, "grad_norm": 0.32262532620474055, "learning_rate": 1.1560882172817033e-07, "loss": 0.0185, "step": 12413 }, { "epoch": 0.9664931146902828, "grad_norm": 0.3454412579605928, "learning_rate": 1.1507349067084061e-07, "loss": 0.0265, "step": 12414 }, { "epoch": 0.9665709697824924, "grad_norm": 0.33878387234592755, "learning_rate": 1.145393983509635e-07, "loss": 0.0189, "step": 12415 }, { "epoch": 0.9666488248747019, "grad_norm": 0.4585023023216695, "learning_rate": 1.1400654480181017e-07, "loss": 0.035, "step": 12416 }, { "epoch": 0.9667266799669116, "grad_norm": 0.4150221244299008, "learning_rate": 1.134749300565785e-07, "loss": 0.032, "step": 12417 }, { "epoch": 0.9668045350591212, "grad_norm": 0.3338920869245365, "learning_rate": 1.1294455414838423e-07, "loss": 0.0202, "step": 12418 }, { "epoch": 0.9668823901513308, "grad_norm": 0.5997646143491786, "learning_rate": 1.1241541711026538e-07, "loss": 0.0526, "step": 12419 }, { "epoch": 0.9669602452435404, "grad_norm": 0.45597192857640295, "learning_rate": 1.118875189751889e-07, "loss": 0.0394, "step": 12420 }, { "epoch": 0.9670381003357501, "grad_norm": 0.4859849311860519, "learning_rate": 1.1136085977603517e-07, "loss": 0.0455, "step": 12421 }, { "epoch": 0.9671159554279597, "grad_norm": 0.43103293979147006, "learning_rate": 1.1083543954561571e-07, "loss": 0.0445, "step": 12422 }, { "epoch": 0.9671938105201693, "grad_norm": 0.4960213686814157, "learning_rate": 1.103112583166599e-07, "loss": 0.0483, "step": 12423 }, { "epoch": 0.967271665612379, "grad_norm": 0.44760916489347874, "learning_rate": 1.0978831612182384e-07, "loss": 0.0442, "step": 12424 }, { "epoch": 0.9673495207045886, "grad_norm": 0.4917617621387027, "learning_rate": 1.0926661299368368e-07, "loss": 0.0537, "step": 12425 }, { "epoch": 0.9674273757967982, "grad_norm": 0.36873342838459205, "learning_rate": 1.087461489647379e-07, "loss": 0.0209, "step": 12426 }, { "epoch": 0.9675052308890079, "grad_norm": 0.35920659439447655, "learning_rate": 1.0822692406741164e-07, "loss": 0.0246, "step": 12427 }, { "epoch": 0.9675830859812175, "grad_norm": 0.576656969956111, "learning_rate": 1.0770893833404795e-07, "loss": 0.0682, "step": 12428 }, { "epoch": 0.967660941073427, "grad_norm": 0.44775733180692995, "learning_rate": 1.0719219179691431e-07, "loss": 0.0448, "step": 12429 }, { "epoch": 0.9677387961656367, "grad_norm": 0.4111807039905352, "learning_rate": 1.06676684488205e-07, "loss": 0.0334, "step": 12430 }, { "epoch": 0.9678166512578463, "grad_norm": 0.5173983467889666, "learning_rate": 1.0616241644002767e-07, "loss": 0.0489, "step": 12431 }, { "epoch": 0.9678945063500559, "grad_norm": 0.630074418909264, "learning_rate": 1.0564938768442557e-07, "loss": 0.0634, "step": 12432 }, { "epoch": 0.9679723614422656, "grad_norm": 0.5255108766899133, "learning_rate": 1.0513759825335312e-07, "loss": 0.0522, "step": 12433 }, { "epoch": 0.9680502165344752, "grad_norm": 0.3556225684371515, "learning_rate": 1.0462704817869374e-07, "loss": 0.0278, "step": 12434 }, { "epoch": 0.9681280716266848, "grad_norm": 0.5058382186490984, "learning_rate": 1.041177374922553e-07, "loss": 0.0475, "step": 12435 }, { "epoch": 0.9682059267188945, "grad_norm": 0.4060785456821617, "learning_rate": 1.036096662257613e-07, "loss": 0.0317, "step": 12436 }, { "epoch": 0.9682837818111041, "grad_norm": 0.42064405976779856, "learning_rate": 1.03102834410862e-07, "loss": 0.0326, "step": 12437 }, { "epoch": 0.9683616369033137, "grad_norm": 0.49833875703077407, "learning_rate": 1.0259724207913435e-07, "loss": 0.0388, "step": 12438 }, { "epoch": 0.9684394919955234, "grad_norm": 0.40916272973307183, "learning_rate": 1.0209288926207539e-07, "loss": 0.0295, "step": 12439 }, { "epoch": 0.968517347087733, "grad_norm": 0.4828462045972946, "learning_rate": 1.0158977599109776e-07, "loss": 0.048, "step": 12440 }, { "epoch": 0.9685952021799425, "grad_norm": 0.5106752623086509, "learning_rate": 1.0108790229754528e-07, "loss": 0.0428, "step": 12441 }, { "epoch": 0.9686730572721522, "grad_norm": 0.3501720197699583, "learning_rate": 1.0058726821268405e-07, "loss": 0.0244, "step": 12442 }, { "epoch": 0.9687509123643618, "grad_norm": 0.47527412403144986, "learning_rate": 1.0008787376770245e-07, "loss": 0.0464, "step": 12443 }, { "epoch": 0.9688287674565714, "grad_norm": 0.45524546224672763, "learning_rate": 9.958971899370672e-08, "loss": 0.0307, "step": 12444 }, { "epoch": 0.9689066225487811, "grad_norm": 0.42377003516964584, "learning_rate": 9.909280392173204e-08, "loss": 0.0412, "step": 12445 }, { "epoch": 0.9689844776409907, "grad_norm": 0.44549476587352965, "learning_rate": 9.859712858273362e-08, "loss": 0.0351, "step": 12446 }, { "epoch": 0.9690623327332003, "grad_norm": 0.3376921892758869, "learning_rate": 9.81026930075868e-08, "loss": 0.0162, "step": 12447 }, { "epoch": 0.96914018782541, "grad_norm": 0.541443690571423, "learning_rate": 9.7609497227098e-08, "loss": 0.0589, "step": 12448 }, { "epoch": 0.9692180429176196, "grad_norm": 0.4047044941110064, "learning_rate": 9.711754127198714e-08, "loss": 0.037, "step": 12449 }, { "epoch": 0.9692958980098292, "grad_norm": 0.5496144131927568, "learning_rate": 9.662682517290078e-08, "loss": 0.0483, "step": 12450 }, { "epoch": 0.9692958980098292, "eval_loss": 0.004752133507281542, "eval_runtime": 162.2264, "eval_samples_per_second": 17.753, "eval_steps_per_second": 0.635, "step": 12450 }, { "epoch": 0.9693737531020389, "grad_norm": 0.34676853498894167, "learning_rate": 9.613734896041004e-08, "loss": 0.0226, "step": 12451 }, { "epoch": 0.9694516081942485, "grad_norm": 0.44249391101483804, "learning_rate": 9.564911266500387e-08, "loss": 0.0313, "step": 12452 }, { "epoch": 0.9695294632864581, "grad_norm": 0.3864065034304279, "learning_rate": 9.516211631710237e-08, "loss": 0.0238, "step": 12453 }, { "epoch": 0.9696073183786678, "grad_norm": 0.5151108690497049, "learning_rate": 9.467635994703683e-08, "loss": 0.0518, "step": 12454 }, { "epoch": 0.9696851734708773, "grad_norm": 0.36376812303537653, "learning_rate": 9.419184358507416e-08, "loss": 0.0334, "step": 12455 }, { "epoch": 0.9697630285630869, "grad_norm": 0.5804116129383551, "learning_rate": 9.370856726139244e-08, "loss": 0.0581, "step": 12456 }, { "epoch": 0.9698408836552965, "grad_norm": 0.4538055028887854, "learning_rate": 9.322653100609869e-08, "loss": 0.041, "step": 12457 }, { "epoch": 0.9699187387475062, "grad_norm": 0.5021189398163337, "learning_rate": 9.274573484922223e-08, "loss": 0.0524, "step": 12458 }, { "epoch": 0.9699965938397158, "grad_norm": 0.45535363377944227, "learning_rate": 9.226617882071465e-08, "loss": 0.0413, "step": 12459 }, { "epoch": 0.9700744489319254, "grad_norm": 0.4884524231942227, "learning_rate": 9.178786295044984e-08, "loss": 0.0348, "step": 12460 }, { "epoch": 0.9701523040241351, "grad_norm": 0.3889772755317375, "learning_rate": 9.131078726822395e-08, "loss": 0.0338, "step": 12461 }, { "epoch": 0.9702301591163447, "grad_norm": 0.3633379906973117, "learning_rate": 9.083495180375545e-08, "loss": 0.023, "step": 12462 }, { "epoch": 0.9703080142085543, "grad_norm": 0.40422892471947663, "learning_rate": 9.036035658668951e-08, "loss": 0.0404, "step": 12463 }, { "epoch": 0.970385869300764, "grad_norm": 0.26278260025433314, "learning_rate": 8.988700164658914e-08, "loss": 0.0109, "step": 12464 }, { "epoch": 0.9704637243929736, "grad_norm": 0.37944434084856976, "learning_rate": 8.941488701294409e-08, "loss": 0.029, "step": 12465 }, { "epoch": 0.9705415794851832, "grad_norm": 0.49684064265243494, "learning_rate": 8.894401271516196e-08, "loss": 0.0463, "step": 12466 }, { "epoch": 0.9706194345773929, "grad_norm": 0.5168223319712049, "learning_rate": 8.847437878257926e-08, "loss": 0.0568, "step": 12467 }, { "epoch": 0.9706972896696024, "grad_norm": 0.44825554946550256, "learning_rate": 8.800598524444815e-08, "loss": 0.0405, "step": 12468 }, { "epoch": 0.970775144761812, "grad_norm": 0.3755206759674022, "learning_rate": 8.753883212995195e-08, "loss": 0.0324, "step": 12469 }, { "epoch": 0.9708529998540217, "grad_norm": 0.40477409706994955, "learning_rate": 8.707291946818742e-08, "loss": 0.0317, "step": 12470 }, { "epoch": 0.9709308549462313, "grad_norm": 0.45922754138075234, "learning_rate": 8.66082472881824e-08, "loss": 0.04, "step": 12471 }, { "epoch": 0.9710087100384409, "grad_norm": 0.39667536428202677, "learning_rate": 8.614481561888044e-08, "loss": 0.0372, "step": 12472 }, { "epoch": 0.9710865651306506, "grad_norm": 0.3896417435010161, "learning_rate": 8.56826244891562e-08, "loss": 0.0321, "step": 12473 }, { "epoch": 0.9711644202228602, "grad_norm": 0.49983097530784626, "learning_rate": 8.522167392779779e-08, "loss": 0.0437, "step": 12474 }, { "epoch": 0.9712422753150698, "grad_norm": 0.46578019617901834, "learning_rate": 8.476196396351999e-08, "loss": 0.0471, "step": 12475 }, { "epoch": 0.9713201304072795, "grad_norm": 0.36234597872295565, "learning_rate": 8.430349462496213e-08, "loss": 0.032, "step": 12476 }, { "epoch": 0.9713979854994891, "grad_norm": 0.49642135282911054, "learning_rate": 8.384626594068578e-08, "loss": 0.0478, "step": 12477 }, { "epoch": 0.9714758405916987, "grad_norm": 0.35094320101473697, "learning_rate": 8.339027793917264e-08, "loss": 0.0218, "step": 12478 }, { "epoch": 0.9715536956839084, "grad_norm": 0.3317999161905527, "learning_rate": 8.293553064882886e-08, "loss": 0.0226, "step": 12479 }, { "epoch": 0.971631550776118, "grad_norm": 0.5363688646480902, "learning_rate": 8.24820240979829e-08, "loss": 0.0529, "step": 12480 }, { "epoch": 0.9717094058683275, "grad_norm": 0.3256531478046616, "learning_rate": 8.202975831488546e-08, "loss": 0.022, "step": 12481 }, { "epoch": 0.9717872609605372, "grad_norm": 0.357186968749825, "learning_rate": 8.157873332771182e-08, "loss": 0.0189, "step": 12482 }, { "epoch": 0.9718651160527468, "grad_norm": 0.6251390781570413, "learning_rate": 8.11289491645595e-08, "loss": 0.0553, "step": 12483 }, { "epoch": 0.9719429711449564, "grad_norm": 0.5564250669198615, "learning_rate": 8.068040585344383e-08, "loss": 0.0577, "step": 12484 }, { "epoch": 0.9720208262371661, "grad_norm": 0.4021503237978114, "learning_rate": 8.023310342231361e-08, "loss": 0.0248, "step": 12485 }, { "epoch": 0.9720986813293757, "grad_norm": 0.5469604799051463, "learning_rate": 7.978704189902653e-08, "loss": 0.0419, "step": 12486 }, { "epoch": 0.9721765364215853, "grad_norm": 0.372684123198454, "learning_rate": 7.934222131137592e-08, "loss": 0.0267, "step": 12487 }, { "epoch": 0.972254391513795, "grad_norm": 0.39101337100248673, "learning_rate": 7.889864168706852e-08, "loss": 0.0333, "step": 12488 }, { "epoch": 0.9723322466060046, "grad_norm": 0.38799796818407634, "learning_rate": 7.845630305373775e-08, "loss": 0.0268, "step": 12489 }, { "epoch": 0.9724101016982142, "grad_norm": 0.45387888212913574, "learning_rate": 7.801520543893937e-08, "loss": 0.0437, "step": 12490 }, { "epoch": 0.9724879567904238, "grad_norm": 0.33366528996666756, "learning_rate": 7.757534887015361e-08, "loss": 0.0216, "step": 12491 }, { "epoch": 0.9725658118826335, "grad_norm": 0.48265651078247546, "learning_rate": 7.713673337477856e-08, "loss": 0.0417, "step": 12492 }, { "epoch": 0.9726436669748431, "grad_norm": 0.5239214738285423, "learning_rate": 7.669935898013903e-08, "loss": 0.0472, "step": 12493 }, { "epoch": 0.9727215220670526, "grad_norm": 0.44422271191432855, "learning_rate": 7.626322571348433e-08, "loss": 0.0353, "step": 12494 }, { "epoch": 0.9727993771592623, "grad_norm": 0.4341086293540245, "learning_rate": 7.582833360197939e-08, "loss": 0.0356, "step": 12495 }, { "epoch": 0.9728772322514719, "grad_norm": 0.3586494049471375, "learning_rate": 7.539468267271588e-08, "loss": 0.0194, "step": 12496 }, { "epoch": 0.9729550873436815, "grad_norm": 0.4873574958156872, "learning_rate": 7.496227295271219e-08, "loss": 0.039, "step": 12497 }, { "epoch": 0.9730329424358912, "grad_norm": 0.42015829678759326, "learning_rate": 7.45311044689001e-08, "loss": 0.0243, "step": 12498 }, { "epoch": 0.9731107975281008, "grad_norm": 0.34806674498392964, "learning_rate": 7.41011772481448e-08, "loss": 0.0181, "step": 12499 }, { "epoch": 0.9731886526203104, "grad_norm": 0.37944302375449707, "learning_rate": 7.367249131722487e-08, "loss": 0.0354, "step": 12500 }, { "epoch": 0.9731886526203104, "eval_loss": 0.0047494955360889435, "eval_runtime": 162.6701, "eval_samples_per_second": 17.705, "eval_steps_per_second": 0.633, "step": 12500 }, { "epoch": 0.9732665077125201, "grad_norm": 0.3932687779457946, "learning_rate": 7.324504670284782e-08, "loss": 0.033, "step": 12501 }, { "epoch": 0.9733443628047297, "grad_norm": 0.48685110825899386, "learning_rate": 7.281884343163903e-08, "loss": 0.0553, "step": 12502 }, { "epoch": 0.9734222178969393, "grad_norm": 0.38477957972026283, "learning_rate": 7.239388153015059e-08, "loss": 0.0296, "step": 12503 }, { "epoch": 0.973500072989149, "grad_norm": 0.361543203714553, "learning_rate": 7.197016102485465e-08, "loss": 0.0273, "step": 12504 }, { "epoch": 0.9735779280813586, "grad_norm": 0.33084906171898937, "learning_rate": 7.154768194215011e-08, "loss": 0.0181, "step": 12505 }, { "epoch": 0.9736557831735682, "grad_norm": 0.3922865876574331, "learning_rate": 7.112644430835147e-08, "loss": 0.0303, "step": 12506 }, { "epoch": 0.9737336382657779, "grad_norm": 0.4743285471823101, "learning_rate": 7.070644814970218e-08, "loss": 0.0404, "step": 12507 }, { "epoch": 0.9738114933579874, "grad_norm": 0.4601629968015294, "learning_rate": 7.028769349236354e-08, "loss": 0.0473, "step": 12508 }, { "epoch": 0.973889348450197, "grad_norm": 0.49020097633184645, "learning_rate": 6.987018036242354e-08, "loss": 0.043, "step": 12509 }, { "epoch": 0.9739672035424067, "grad_norm": 0.4024664398397822, "learning_rate": 6.945390878589253e-08, "loss": 0.0269, "step": 12510 }, { "epoch": 0.9740450586346163, "grad_norm": 0.31159267196863477, "learning_rate": 6.903887878870085e-08, "loss": 0.0224, "step": 12511 }, { "epoch": 0.9741229137268259, "grad_norm": 0.4595409496393686, "learning_rate": 6.86250903967034e-08, "loss": 0.0375, "step": 12512 }, { "epoch": 0.9742007688190356, "grad_norm": 0.5050866068247397, "learning_rate": 6.82125436356773e-08, "loss": 0.0491, "step": 12513 }, { "epoch": 0.9742786239112452, "grad_norm": 0.37948505608335414, "learning_rate": 6.780123853132204e-08, "loss": 0.0315, "step": 12514 }, { "epoch": 0.9743564790034548, "grad_norm": 0.5110914056797439, "learning_rate": 6.739117510925929e-08, "loss": 0.0464, "step": 12515 }, { "epoch": 0.9744343340956645, "grad_norm": 0.48376190044296014, "learning_rate": 6.698235339503534e-08, "loss": 0.0371, "step": 12516 }, { "epoch": 0.9745121891878741, "grad_norm": 0.3733619367794381, "learning_rate": 6.657477341411867e-08, "loss": 0.0293, "step": 12517 }, { "epoch": 0.9745900442800837, "grad_norm": 0.565454200669687, "learning_rate": 6.616843519189564e-08, "loss": 0.0619, "step": 12518 }, { "epoch": 0.9746678993722934, "grad_norm": 0.342068591305601, "learning_rate": 6.576333875368379e-08, "loss": 0.0198, "step": 12519 }, { "epoch": 0.974745754464503, "grad_norm": 0.32450801081859837, "learning_rate": 6.535948412471626e-08, "loss": 0.0202, "step": 12520 }, { "epoch": 0.9748236095567125, "grad_norm": 0.4433924404025126, "learning_rate": 6.495687133015072e-08, "loss": 0.0422, "step": 12521 }, { "epoch": 0.9749014646489222, "grad_norm": 0.39034047975647496, "learning_rate": 6.455550039506931e-08, "loss": 0.0278, "step": 12522 }, { "epoch": 0.9749793197411318, "grad_norm": 0.42700832127648114, "learning_rate": 6.415537134447647e-08, "loss": 0.0379, "step": 12523 }, { "epoch": 0.9750571748333414, "grad_norm": 0.4395951791112799, "learning_rate": 6.375648420329672e-08, "loss": 0.0364, "step": 12524 }, { "epoch": 0.9751350299255511, "grad_norm": 0.34429569457281073, "learning_rate": 6.335883899637907e-08, "loss": 0.0277, "step": 12525 }, { "epoch": 0.9752128850177607, "grad_norm": 0.45719428483232577, "learning_rate": 6.296243574849703e-08, "loss": 0.0456, "step": 12526 }, { "epoch": 0.9752907401099703, "grad_norm": 0.5079029898778001, "learning_rate": 6.256727448434197e-08, "loss": 0.0443, "step": 12527 }, { "epoch": 0.9753685952021799, "grad_norm": 0.40930198593636746, "learning_rate": 6.217335522852974e-08, "loss": 0.0451, "step": 12528 }, { "epoch": 0.9754464502943896, "grad_norm": 0.39578781371630833, "learning_rate": 6.178067800560295e-08, "loss": 0.0282, "step": 12529 }, { "epoch": 0.9755243053865992, "grad_norm": 0.5164841235225177, "learning_rate": 6.138924284002201e-08, "loss": 0.0484, "step": 12530 }, { "epoch": 0.9756021604788088, "grad_norm": 0.4337982370718088, "learning_rate": 6.099904975616965e-08, "loss": 0.0339, "step": 12531 }, { "epoch": 0.9756800155710185, "grad_norm": 0.4631466958143197, "learning_rate": 6.061009877835755e-08, "loss": 0.043, "step": 12532 }, { "epoch": 0.975757870663228, "grad_norm": 0.3456002303140778, "learning_rate": 6.022238993081076e-08, "loss": 0.0238, "step": 12533 }, { "epoch": 0.9758357257554376, "grad_norm": 0.5027345777062444, "learning_rate": 5.98359232376855e-08, "loss": 0.0491, "step": 12534 }, { "epoch": 0.9759135808476473, "grad_norm": 0.5271209689085989, "learning_rate": 5.945069872305587e-08, "loss": 0.0534, "step": 12535 }, { "epoch": 0.9759914359398569, "grad_norm": 0.36875512776418923, "learning_rate": 5.906671641091599e-08, "loss": 0.0291, "step": 12536 }, { "epoch": 0.9760692910320665, "grad_norm": 0.39557063205149007, "learning_rate": 5.8683976325191185e-08, "loss": 0.0323, "step": 12537 }, { "epoch": 0.9761471461242762, "grad_norm": 0.47047426728407143, "learning_rate": 5.830247848972237e-08, "loss": 0.0452, "step": 12538 }, { "epoch": 0.9762250012164858, "grad_norm": 0.41750122616041535, "learning_rate": 5.792222292827276e-08, "loss": 0.0319, "step": 12539 }, { "epoch": 0.9763028563086954, "grad_norm": 0.33925958807331463, "learning_rate": 5.754320966453453e-08, "loss": 0.0184, "step": 12540 }, { "epoch": 0.9763807114009051, "grad_norm": 0.3694098607279801, "learning_rate": 5.716543872211544e-08, "loss": 0.023, "step": 12541 }, { "epoch": 0.9764585664931147, "grad_norm": 0.44746087807889307, "learning_rate": 5.678891012455001e-08, "loss": 0.0396, "step": 12542 }, { "epoch": 0.9765364215853243, "grad_norm": 0.45545105733420327, "learning_rate": 5.641362389529503e-08, "loss": 0.0459, "step": 12543 }, { "epoch": 0.976614276677534, "grad_norm": 0.454342912398698, "learning_rate": 5.603958005772736e-08, "loss": 0.0448, "step": 12544 }, { "epoch": 0.9766921317697436, "grad_norm": 0.5178749925310174, "learning_rate": 5.5666778635150574e-08, "loss": 0.0351, "step": 12545 }, { "epoch": 0.9767699868619532, "grad_norm": 0.3389613184050311, "learning_rate": 5.5295219650783884e-08, "loss": 0.023, "step": 12546 }, { "epoch": 0.9768478419541629, "grad_norm": 0.5193258999078615, "learning_rate": 5.492490312777987e-08, "loss": 0.0455, "step": 12547 }, { "epoch": 0.9769256970463724, "grad_norm": 0.36202976893193073, "learning_rate": 5.4555829089202314e-08, "loss": 0.0275, "step": 12548 }, { "epoch": 0.977003552138582, "grad_norm": 0.3959556220803521, "learning_rate": 5.418799755804394e-08, "loss": 0.026, "step": 12549 }, { "epoch": 0.9770814072307917, "grad_norm": 0.36117503858814354, "learning_rate": 5.382140855721973e-08, "loss": 0.0186, "step": 12550 }, { "epoch": 0.9770814072307917, "eval_loss": 0.004745509475469589, "eval_runtime": 162.0759, "eval_samples_per_second": 17.769, "eval_steps_per_second": 0.636, "step": 12550 }, { "epoch": 0.9771592623230013, "grad_norm": 0.5580967603072682, "learning_rate": 5.3456062109569216e-08, "loss": 0.0585, "step": 12551 }, { "epoch": 0.9772371174152109, "grad_norm": 0.42655234589149227, "learning_rate": 5.3091958237847516e-08, "loss": 0.038, "step": 12552 }, { "epoch": 0.9773149725074206, "grad_norm": 0.45703047805637237, "learning_rate": 5.272909696473649e-08, "loss": 0.0369, "step": 12553 }, { "epoch": 0.9773928275996302, "grad_norm": 0.4196031561489269, "learning_rate": 5.236747831284472e-08, "loss": 0.0367, "step": 12554 }, { "epoch": 0.9774706826918398, "grad_norm": 0.4616518721446982, "learning_rate": 5.200710230469419e-08, "loss": 0.0383, "step": 12555 }, { "epoch": 0.9775485377840495, "grad_norm": 0.418701583263822, "learning_rate": 5.164796896274027e-08, "loss": 0.0357, "step": 12556 }, { "epoch": 0.9776263928762591, "grad_norm": 0.5289521941848779, "learning_rate": 5.1290078309351735e-08, "loss": 0.0553, "step": 12557 }, { "epoch": 0.9777042479684687, "grad_norm": 0.32536562967430305, "learning_rate": 5.0933430366821854e-08, "loss": 0.0208, "step": 12558 }, { "epoch": 0.9777821030606784, "grad_norm": 0.29952206994711933, "learning_rate": 5.057802515737287e-08, "loss": 0.0177, "step": 12559 }, { "epoch": 0.977859958152888, "grad_norm": 0.42502018026298705, "learning_rate": 5.022386270314039e-08, "loss": 0.0321, "step": 12560 }, { "epoch": 0.9779378132450975, "grad_norm": 0.5219835675326139, "learning_rate": 4.987094302619122e-08, "loss": 0.0538, "step": 12561 }, { "epoch": 0.9780156683373072, "grad_norm": 0.4483640526654012, "learning_rate": 4.951926614850555e-08, "loss": 0.038, "step": 12562 }, { "epoch": 0.9780935234295168, "grad_norm": 0.4347813685810988, "learning_rate": 4.9168832091996964e-08, "loss": 0.0317, "step": 12563 }, { "epoch": 0.9781713785217264, "grad_norm": 0.41471810411483334, "learning_rate": 4.8819640878492447e-08, "loss": 0.0266, "step": 12564 }, { "epoch": 0.978249233613936, "grad_norm": 0.4404706479868009, "learning_rate": 4.8471692529743484e-08, "loss": 0.0386, "step": 12565 }, { "epoch": 0.9783270887061457, "grad_norm": 0.4601563312933974, "learning_rate": 4.81249870674283e-08, "loss": 0.0401, "step": 12566 }, { "epoch": 0.9784049437983553, "grad_norm": 0.4756570588421975, "learning_rate": 4.777952451314516e-08, "loss": 0.0366, "step": 12567 }, { "epoch": 0.9784827988905649, "grad_norm": 0.5557947051256408, "learning_rate": 4.7435304888414635e-08, "loss": 0.052, "step": 12568 }, { "epoch": 0.9785606539827746, "grad_norm": 0.33983485452967727, "learning_rate": 4.709232821467735e-08, "loss": 0.0234, "step": 12569 }, { "epoch": 0.9786385090749842, "grad_norm": 0.2568466733419521, "learning_rate": 4.675059451330288e-08, "loss": 0.0103, "step": 12570 }, { "epoch": 0.9787163641671938, "grad_norm": 0.48048928126364476, "learning_rate": 4.641010380557642e-08, "loss": 0.0414, "step": 12571 }, { "epoch": 0.9787942192594035, "grad_norm": 0.4140025504413717, "learning_rate": 4.607085611270989e-08, "loss": 0.0308, "step": 12572 }, { "epoch": 0.978872074351613, "grad_norm": 0.42040172894076944, "learning_rate": 4.573285145583972e-08, "loss": 0.0245, "step": 12573 }, { "epoch": 0.9789499294438226, "grad_norm": 0.42242270304349644, "learning_rate": 4.539608985601795e-08, "loss": 0.0358, "step": 12574 }, { "epoch": 0.9790277845360323, "grad_norm": 0.5861857109732428, "learning_rate": 4.506057133422559e-08, "loss": 0.0532, "step": 12575 }, { "epoch": 0.9791056396282419, "grad_norm": 0.35968025163454315, "learning_rate": 4.4726295911361464e-08, "loss": 0.0302, "step": 12576 }, { "epoch": 0.9791834947204515, "grad_norm": 0.46057359840608025, "learning_rate": 4.4393263608253354e-08, "loss": 0.0417, "step": 12577 }, { "epoch": 0.9792613498126612, "grad_norm": 0.4681727412248754, "learning_rate": 4.406147444564246e-08, "loss": 0.0407, "step": 12578 }, { "epoch": 0.9793392049048708, "grad_norm": 0.515864589641656, "learning_rate": 4.373092844420335e-08, "loss": 0.0515, "step": 12579 }, { "epoch": 0.9794170599970804, "grad_norm": 0.35591868638316604, "learning_rate": 4.340162562452177e-08, "loss": 0.0266, "step": 12580 }, { "epoch": 0.9794949150892901, "grad_norm": 0.42017367256474036, "learning_rate": 4.3073566007116876e-08, "loss": 0.0199, "step": 12581 }, { "epoch": 0.9795727701814997, "grad_norm": 0.3639883471253532, "learning_rate": 4.2746749612423424e-08, "loss": 0.0255, "step": 12582 }, { "epoch": 0.9796506252737093, "grad_norm": 0.30361367641665793, "learning_rate": 4.2421176460798466e-08, "loss": 0.013, "step": 12583 }, { "epoch": 0.979728480365919, "grad_norm": 0.4589947988185595, "learning_rate": 4.2096846572525775e-08, "loss": 0.0383, "step": 12584 }, { "epoch": 0.9798063354581286, "grad_norm": 0.42747882104946033, "learning_rate": 4.177375996780919e-08, "loss": 0.0311, "step": 12585 }, { "epoch": 0.9798841905503382, "grad_norm": 0.3623553111963271, "learning_rate": 4.145191666677484e-08, "loss": 0.0203, "step": 12586 }, { "epoch": 0.9799620456425479, "grad_norm": 0.4675551582191596, "learning_rate": 4.1131316689473345e-08, "loss": 0.0384, "step": 12587 }, { "epoch": 0.9800399007347574, "grad_norm": 0.5968692625277658, "learning_rate": 4.08119600558754e-08, "loss": 0.0688, "step": 12588 }, { "epoch": 0.980117755826967, "grad_norm": 0.40258672212057095, "learning_rate": 4.04938467858762e-08, "loss": 0.0244, "step": 12589 }, { "epoch": 0.9801956109191767, "grad_norm": 0.36996864516423433, "learning_rate": 4.017697689929323e-08, "loss": 0.0236, "step": 12590 }, { "epoch": 0.9802734660113863, "grad_norm": 0.5634258810130879, "learning_rate": 3.986135041586403e-08, "loss": 0.0586, "step": 12591 }, { "epoch": 0.9803513211035959, "grad_norm": 0.45388309008508776, "learning_rate": 3.9546967355252874e-08, "loss": 0.0375, "step": 12592 }, { "epoch": 0.9804291761958056, "grad_norm": 0.40359128679692874, "learning_rate": 3.9233827737041875e-08, "loss": 0.0342, "step": 12593 }, { "epoch": 0.9805070312880152, "grad_norm": 0.5187174531182392, "learning_rate": 3.892193158073987e-08, "loss": 0.0402, "step": 12594 }, { "epoch": 0.9805848863802248, "grad_norm": 0.44583960010349993, "learning_rate": 3.8611278905775764e-08, "loss": 0.0312, "step": 12595 }, { "epoch": 0.9806627414724345, "grad_norm": 0.45790521730803196, "learning_rate": 3.830186973150296e-08, "loss": 0.0453, "step": 12596 }, { "epoch": 0.9807405965646441, "grad_norm": 0.4325015060128421, "learning_rate": 3.799370407719494e-08, "loss": 0.0487, "step": 12597 }, { "epoch": 0.9808184516568537, "grad_norm": 0.38126099959872106, "learning_rate": 3.768678196204967e-08, "loss": 0.0294, "step": 12598 }, { "epoch": 0.9808963067490633, "grad_norm": 0.3846199648139914, "learning_rate": 3.7381103405187414e-08, "loss": 0.0321, "step": 12599 }, { "epoch": 0.980974161841273, "grad_norm": 0.35364039896165483, "learning_rate": 3.70766684256485e-08, "loss": 0.0271, "step": 12600 }, { "epoch": 0.980974161841273, "eval_loss": 0.004745341371744871, "eval_runtime": 162.3879, "eval_samples_per_second": 17.735, "eval_steps_per_second": 0.634, "step": 12600 }, { "epoch": 0.9810520169334825, "grad_norm": 0.44736532595517836, "learning_rate": 3.677347704239997e-08, "loss": 0.0319, "step": 12601 }, { "epoch": 0.9811298720256921, "grad_norm": 0.38253105427177686, "learning_rate": 3.6471529274328954e-08, "loss": 0.0372, "step": 12602 }, { "epoch": 0.9812077271179018, "grad_norm": 0.3673917412707235, "learning_rate": 3.617082514024262e-08, "loss": 0.0263, "step": 12603 }, { "epoch": 0.9812855822101114, "grad_norm": 0.49922249969617727, "learning_rate": 3.5871364658877086e-08, "loss": 0.0445, "step": 12604 }, { "epoch": 0.981363437302321, "grad_norm": 0.49353998734073545, "learning_rate": 3.557314784888632e-08, "loss": 0.0527, "step": 12605 }, { "epoch": 0.9814412923945307, "grad_norm": 0.3202642452334792, "learning_rate": 3.527617472884659e-08, "loss": 0.0192, "step": 12606 }, { "epoch": 0.9815191474867403, "grad_norm": 0.586538329710281, "learning_rate": 3.498044531725864e-08, "loss": 0.0596, "step": 12607 }, { "epoch": 0.9815970025789499, "grad_norm": 0.37404904030353175, "learning_rate": 3.4685959632545505e-08, "loss": 0.0282, "step": 12608 }, { "epoch": 0.9816748576711596, "grad_norm": 0.45726221968553626, "learning_rate": 3.439271769305252e-08, "loss": 0.0337, "step": 12609 }, { "epoch": 0.9817527127633692, "grad_norm": 0.4072998797674296, "learning_rate": 3.410071951704508e-08, "loss": 0.0345, "step": 12610 }, { "epoch": 0.9818305678555788, "grad_norm": 0.496663671987084, "learning_rate": 3.380996512271528e-08, "loss": 0.0459, "step": 12611 }, { "epoch": 0.9819084229477885, "grad_norm": 0.41197696142107726, "learning_rate": 3.352045452817532e-08, "loss": 0.0351, "step": 12612 }, { "epoch": 0.981986278039998, "grad_norm": 0.44338048237532124, "learning_rate": 3.3232187751459645e-08, "loss": 0.0361, "step": 12613 }, { "epoch": 0.9820641331322076, "grad_norm": 0.576263916005108, "learning_rate": 3.294516481052723e-08, "loss": 0.0565, "step": 12614 }, { "epoch": 0.9821419882244173, "grad_norm": 0.43689707641176834, "learning_rate": 3.2659385723259327e-08, "loss": 0.0391, "step": 12615 }, { "epoch": 0.9822198433166269, "grad_norm": 0.4703362102021272, "learning_rate": 3.2374850507452815e-08, "loss": 0.0465, "step": 12616 }, { "epoch": 0.9822976984088365, "grad_norm": 0.41767496703518553, "learning_rate": 3.209155918084017e-08, "loss": 0.0294, "step": 12617 }, { "epoch": 0.9823755535010462, "grad_norm": 0.2316994890264306, "learning_rate": 3.180951176106506e-08, "loss": 0.0095, "step": 12618 }, { "epoch": 0.9824534085932558, "grad_norm": 0.5265442232181675, "learning_rate": 3.152870826570009e-08, "loss": 0.0544, "step": 12619 }, { "epoch": 0.9825312636854654, "grad_norm": 0.5561045756117053, "learning_rate": 3.12491487122335e-08, "loss": 0.0478, "step": 12620 }, { "epoch": 0.9826091187776751, "grad_norm": 0.4144923095526991, "learning_rate": 3.097083311808469e-08, "loss": 0.0391, "step": 12621 }, { "epoch": 0.9826869738698847, "grad_norm": 0.4236094116129578, "learning_rate": 3.069376150059089e-08, "loss": 0.0399, "step": 12622 }, { "epoch": 0.9827648289620943, "grad_norm": 0.4330215758625795, "learning_rate": 3.0417933877011644e-08, "loss": 0.0347, "step": 12623 }, { "epoch": 0.982842684054304, "grad_norm": 0.40597073743801226, "learning_rate": 3.014335026453097e-08, "loss": 0.0214, "step": 12624 }, { "epoch": 0.9829205391465136, "grad_norm": 0.37482555541652357, "learning_rate": 2.987001068025075e-08, "loss": 0.028, "step": 12625 }, { "epoch": 0.9829983942387231, "grad_norm": 0.5216280532808899, "learning_rate": 2.9597915141201806e-08, "loss": 0.044, "step": 12626 }, { "epoch": 0.9830762493309328, "grad_norm": 0.3888921373373316, "learning_rate": 2.93270636643328e-08, "loss": 0.0221, "step": 12627 }, { "epoch": 0.9831541044231424, "grad_norm": 0.37312460975083744, "learning_rate": 2.9057456266519125e-08, "loss": 0.0233, "step": 12628 }, { "epoch": 0.983231959515352, "grad_norm": 0.34867244291199045, "learning_rate": 2.878909296455179e-08, "loss": 0.0299, "step": 12629 }, { "epoch": 0.9833098146075617, "grad_norm": 0.5746674588945856, "learning_rate": 2.8521973775152977e-08, "loss": 0.0496, "step": 12630 }, { "epoch": 0.9833876696997713, "grad_norm": 0.43007754322073194, "learning_rate": 2.825609871496049e-08, "loss": 0.0354, "step": 12631 }, { "epoch": 0.9834655247919809, "grad_norm": 0.3607608027367252, "learning_rate": 2.799146780053885e-08, "loss": 0.0198, "step": 12632 }, { "epoch": 0.9835433798841906, "grad_norm": 0.3143922088076179, "learning_rate": 2.7728081048370436e-08, "loss": 0.0178, "step": 12633 }, { "epoch": 0.9836212349764002, "grad_norm": 0.38375457560864584, "learning_rate": 2.746593847486434e-08, "loss": 0.031, "step": 12634 }, { "epoch": 0.9836990900686098, "grad_norm": 0.42339960202821725, "learning_rate": 2.7205040096351944e-08, "loss": 0.034, "step": 12635 }, { "epoch": 0.9837769451608194, "grad_norm": 0.3850168096199712, "learning_rate": 2.694538592908469e-08, "loss": 0.0287, "step": 12636 }, { "epoch": 0.9838548002530291, "grad_norm": 0.3553549028411916, "learning_rate": 2.6686975989236307e-08, "loss": 0.022, "step": 12637 }, { "epoch": 0.9839326553452387, "grad_norm": 0.4235188840616903, "learning_rate": 2.6429810292909474e-08, "loss": 0.0431, "step": 12638 }, { "epoch": 0.9840105104374482, "grad_norm": 0.4234544159773839, "learning_rate": 2.6173888856120265e-08, "loss": 0.0294, "step": 12639 }, { "epoch": 0.984088365529658, "grad_norm": 0.3799221878279822, "learning_rate": 2.5919211694813705e-08, "loss": 0.0275, "step": 12640 }, { "epoch": 0.9841662206218675, "grad_norm": 0.44414532749514823, "learning_rate": 2.566577882485044e-08, "loss": 0.0523, "step": 12641 }, { "epoch": 0.9842440757140771, "grad_norm": 0.47955234049466455, "learning_rate": 2.54135902620245e-08, "loss": 0.045, "step": 12642 }, { "epoch": 0.9843219308062868, "grad_norm": 0.38178398832011895, "learning_rate": 2.5162646022041106e-08, "loss": 0.024, "step": 12643 }, { "epoch": 0.9843997858984964, "grad_norm": 0.3844445084983208, "learning_rate": 2.491294612053663e-08, "loss": 0.0259, "step": 12644 }, { "epoch": 0.984477640990706, "grad_norm": 0.46021070668361125, "learning_rate": 2.466449057306308e-08, "loss": 0.0363, "step": 12645 }, { "epoch": 0.9845554960829157, "grad_norm": 0.3826692703643969, "learning_rate": 2.4417279395101413e-08, "loss": 0.0263, "step": 12646 }, { "epoch": 0.9846333511751253, "grad_norm": 0.4104388251589792, "learning_rate": 2.417131260204819e-08, "loss": 0.0393, "step": 12647 }, { "epoch": 0.9847112062673349, "grad_norm": 0.4787862716103341, "learning_rate": 2.3926590209226718e-08, "loss": 0.0547, "step": 12648 }, { "epoch": 0.9847890613595446, "grad_norm": 0.3564276323795203, "learning_rate": 2.36831122318848e-08, "loss": 0.0224, "step": 12649 }, { "epoch": 0.9848669164517542, "grad_norm": 0.3801873401865779, "learning_rate": 2.3440878685185854e-08, "loss": 0.0337, "step": 12650 }, { "epoch": 0.9848669164517542, "eval_loss": 0.004745970480144024, "eval_runtime": 162.8062, "eval_samples_per_second": 17.69, "eval_steps_per_second": 0.633, "step": 12650 }, { "epoch": 0.9849447715439638, "grad_norm": 0.284696031022494, "learning_rate": 2.319988958422448e-08, "loss": 0.0152, "step": 12651 }, { "epoch": 0.9850226266361735, "grad_norm": 0.3210447579705444, "learning_rate": 2.2960144944008665e-08, "loss": 0.0184, "step": 12652 }, { "epoch": 0.985100481728383, "grad_norm": 0.41450675470567283, "learning_rate": 2.2721644779475362e-08, "loss": 0.0286, "step": 12653 }, { "epoch": 0.9851783368205926, "grad_norm": 0.3752233892651793, "learning_rate": 2.2484389105483785e-08, "loss": 0.0291, "step": 12654 }, { "epoch": 0.9852561919128023, "grad_norm": 0.5941619032728831, "learning_rate": 2.2248377936811004e-08, "loss": 0.0694, "step": 12655 }, { "epoch": 0.9853340470050119, "grad_norm": 0.22387742268868338, "learning_rate": 2.201361128816082e-08, "loss": 0.0128, "step": 12656 }, { "epoch": 0.9854119020972215, "grad_norm": 0.4569123834888096, "learning_rate": 2.1780089174157083e-08, "loss": 0.0438, "step": 12657 }, { "epoch": 0.9854897571894312, "grad_norm": 0.436933669650826, "learning_rate": 2.154781160935038e-08, "loss": 0.0405, "step": 12658 }, { "epoch": 0.9855676122816408, "grad_norm": 0.4669689143240639, "learning_rate": 2.1316778608204692e-08, "loss": 0.0448, "step": 12659 }, { "epoch": 0.9856454673738504, "grad_norm": 0.3550587516246158, "learning_rate": 2.1086990185117395e-08, "loss": 0.0215, "step": 12660 }, { "epoch": 0.9857233224660601, "grad_norm": 0.40957062977912184, "learning_rate": 2.0858446354401484e-08, "loss": 0.0337, "step": 12661 }, { "epoch": 0.9858011775582697, "grad_norm": 0.3854980306431844, "learning_rate": 2.063114713029446e-08, "loss": 0.0262, "step": 12662 }, { "epoch": 0.9858790326504793, "grad_norm": 0.521765545918422, "learning_rate": 2.0405092526953883e-08, "loss": 0.0595, "step": 12663 }, { "epoch": 0.985956887742689, "grad_norm": 0.3574864779531426, "learning_rate": 2.018028255846405e-08, "loss": 0.0341, "step": 12664 }, { "epoch": 0.9860347428348986, "grad_norm": 0.41247872662041885, "learning_rate": 1.9956717238831524e-08, "loss": 0.0325, "step": 12665 }, { "epoch": 0.9861125979271081, "grad_norm": 0.5420090327477265, "learning_rate": 1.973439658197851e-08, "loss": 0.0492, "step": 12666 }, { "epoch": 0.9861904530193178, "grad_norm": 0.4506356262990234, "learning_rate": 1.9513320601758368e-08, "loss": 0.0283, "step": 12667 }, { "epoch": 0.9862683081115274, "grad_norm": 0.49401840163462496, "learning_rate": 1.9293489311940082e-08, "loss": 0.0576, "step": 12668 }, { "epoch": 0.986346163203737, "grad_norm": 0.34644323237566815, "learning_rate": 1.907490272622159e-08, "loss": 0.022, "step": 12669 }, { "epoch": 0.9864240182959466, "grad_norm": 0.5230101066486617, "learning_rate": 1.885756085821644e-08, "loss": 0.053, "step": 12670 }, { "epoch": 0.9865018733881563, "grad_norm": 0.5285937050237132, "learning_rate": 1.8641463721467135e-08, "loss": 0.0451, "step": 12671 }, { "epoch": 0.9865797284803659, "grad_norm": 0.5178178830916506, "learning_rate": 1.8426611329434018e-08, "loss": 0.0512, "step": 12672 }, { "epoch": 0.9866575835725755, "grad_norm": 0.6609632042124081, "learning_rate": 1.821300369549972e-08, "loss": 0.0663, "step": 12673 }, { "epoch": 0.9867354386647852, "grad_norm": 0.6414369887856229, "learning_rate": 1.800064083297359e-08, "loss": 0.074, "step": 12674 }, { "epoch": 0.9868132937569948, "grad_norm": 0.2639316695972078, "learning_rate": 1.7789522755085055e-08, "loss": 0.0127, "step": 12675 }, { "epoch": 0.9868911488492044, "grad_norm": 0.44675002029337013, "learning_rate": 1.7579649474981365e-08, "loss": 0.0365, "step": 12676 }, { "epoch": 0.9869690039414141, "grad_norm": 0.4672669328858028, "learning_rate": 1.7371021005743172e-08, "loss": 0.041, "step": 12677 }, { "epoch": 0.9870468590336237, "grad_norm": 0.37931292677610967, "learning_rate": 1.7163637360362307e-08, "loss": 0.0208, "step": 12678 }, { "epoch": 0.9871247141258332, "grad_norm": 0.4389953420570026, "learning_rate": 1.6957498551759542e-08, "loss": 0.0301, "step": 12679 }, { "epoch": 0.987202569218043, "grad_norm": 0.4602880305498582, "learning_rate": 1.6752604592775723e-08, "loss": 0.041, "step": 12680 }, { "epoch": 0.9872804243102525, "grad_norm": 0.3669488462101687, "learning_rate": 1.6548955496173968e-08, "loss": 0.041, "step": 12681 }, { "epoch": 0.9873582794024621, "grad_norm": 0.4789065666166609, "learning_rate": 1.634655127464413e-08, "loss": 0.0503, "step": 12682 }, { "epoch": 0.9874361344946718, "grad_norm": 0.4156976841617675, "learning_rate": 1.6145391940789458e-08, "loss": 0.0419, "step": 12683 }, { "epoch": 0.9875139895868814, "grad_norm": 0.3812065972528646, "learning_rate": 1.594547750714659e-08, "loss": 0.0336, "step": 12684 }, { "epoch": 0.987591844679091, "grad_norm": 0.4230332669486754, "learning_rate": 1.574680798616779e-08, "loss": 0.03, "step": 12685 }, { "epoch": 0.9876696997713007, "grad_norm": 0.2866689118301033, "learning_rate": 1.55493833902276e-08, "loss": 0.0186, "step": 12686 }, { "epoch": 0.9877475548635103, "grad_norm": 0.48673182756034483, "learning_rate": 1.5353203731625076e-08, "loss": 0.0577, "step": 12687 }, { "epoch": 0.9878254099557199, "grad_norm": 0.41880637905159884, "learning_rate": 1.515826902258377e-08, "loss": 0.0345, "step": 12688 }, { "epoch": 0.9879032650479296, "grad_norm": 0.3981613084747944, "learning_rate": 1.4964579275242863e-08, "loss": 0.0324, "step": 12689 }, { "epoch": 0.9879811201401392, "grad_norm": 0.3922696815922673, "learning_rate": 1.4772134501672696e-08, "loss": 0.0356, "step": 12690 }, { "epoch": 0.9880589752323488, "grad_norm": 0.339482533803074, "learning_rate": 1.458093471385924e-08, "loss": 0.023, "step": 12691 }, { "epoch": 0.9881368303245585, "grad_norm": 0.4498583989249719, "learning_rate": 1.4390979923712967e-08, "loss": 0.0427, "step": 12692 }, { "epoch": 0.988214685416768, "grad_norm": 0.48357191586731524, "learning_rate": 1.4202270143068853e-08, "loss": 0.0376, "step": 12693 }, { "epoch": 0.9882925405089776, "grad_norm": 0.3240292765744323, "learning_rate": 1.4014805383679719e-08, "loss": 0.0268, "step": 12694 }, { "epoch": 0.9883703956011873, "grad_norm": 0.4313286281591413, "learning_rate": 1.3828585657227334e-08, "loss": 0.0359, "step": 12695 }, { "epoch": 0.9884482506933969, "grad_norm": 0.42394257073250774, "learning_rate": 1.3643610975311305e-08, "loss": 0.0316, "step": 12696 }, { "epoch": 0.9885261057856065, "grad_norm": 0.40943329369033316, "learning_rate": 1.345988134945353e-08, "loss": 0.0282, "step": 12697 }, { "epoch": 0.9886039608778162, "grad_norm": 0.35729502861726153, "learning_rate": 1.3277396791100405e-08, "loss": 0.0206, "step": 12698 }, { "epoch": 0.9886818159700258, "grad_norm": 0.31881497969867817, "learning_rate": 1.3096157311618396e-08, "loss": 0.0164, "step": 12699 }, { "epoch": 0.9887596710622354, "grad_norm": 0.5177264120143689, "learning_rate": 1.291616292230069e-08, "loss": 0.0485, "step": 12700 }, { "epoch": 0.9887596710622354, "eval_loss": 0.004746599588543177, "eval_runtime": 162.2134, "eval_samples_per_second": 17.754, "eval_steps_per_second": 0.635, "step": 12700 }, { "epoch": 0.9888375261544451, "grad_norm": 0.3652290874344488, "learning_rate": 1.273741363435832e-08, "loss": 0.0305, "step": 12701 }, { "epoch": 0.9889153812466547, "grad_norm": 0.3904443706902098, "learning_rate": 1.2559909458926822e-08, "loss": 0.026, "step": 12702 }, { "epoch": 0.9889932363388643, "grad_norm": 0.4946545013483937, "learning_rate": 1.2383650407061798e-08, "loss": 0.0397, "step": 12703 }, { "epoch": 0.989071091431074, "grad_norm": 0.4014954225686248, "learning_rate": 1.2208636489745573e-08, "loss": 0.0281, "step": 12704 }, { "epoch": 0.9891489465232836, "grad_norm": 0.4053179372819325, "learning_rate": 1.2034867717882759e-08, "loss": 0.0286, "step": 12705 }, { "epoch": 0.9892268016154931, "grad_norm": 0.40464785296904165, "learning_rate": 1.1862344102293588e-08, "loss": 0.0309, "step": 12706 }, { "epoch": 0.9893046567077027, "grad_norm": 0.2394228653324462, "learning_rate": 1.169106565372946e-08, "loss": 0.0094, "step": 12707 }, { "epoch": 0.9893825117999124, "grad_norm": 0.31876786888703185, "learning_rate": 1.15210323828574e-08, "loss": 0.0212, "step": 12708 }, { "epoch": 0.989460366892122, "grad_norm": 0.37815834514555713, "learning_rate": 1.1352244300271153e-08, "loss": 0.0331, "step": 12709 }, { "epoch": 0.9895382219843316, "grad_norm": 0.4424368356776677, "learning_rate": 1.1184701416484533e-08, "loss": 0.0332, "step": 12710 }, { "epoch": 0.9896160770765413, "grad_norm": 0.4542801360558548, "learning_rate": 1.1018403741935856e-08, "loss": 0.0423, "step": 12711 }, { "epoch": 0.9896939321687509, "grad_norm": 0.3748582864559831, "learning_rate": 1.0853351286985725e-08, "loss": 0.0207, "step": 12712 }, { "epoch": 0.9897717872609605, "grad_norm": 0.43135832044315153, "learning_rate": 1.0689544061912582e-08, "loss": 0.032, "step": 12713 }, { "epoch": 0.9898496423531702, "grad_norm": 0.4412452839509144, "learning_rate": 1.05269820769216e-08, "loss": 0.0372, "step": 12714 }, { "epoch": 0.9899274974453798, "grad_norm": 0.5187832066468895, "learning_rate": 1.0365665342142451e-08, "loss": 0.046, "step": 12715 }, { "epoch": 0.9900053525375894, "grad_norm": 0.4963625463134094, "learning_rate": 1.0205593867624874e-08, "loss": 0.0437, "step": 12716 }, { "epoch": 0.9900832076297991, "grad_norm": 0.517883840817611, "learning_rate": 1.0046767663336454e-08, "loss": 0.0302, "step": 12717 }, { "epoch": 0.9901610627220087, "grad_norm": 0.37600904110368255, "learning_rate": 9.889186739171497e-09, "loss": 0.0275, "step": 12718 }, { "epoch": 0.9902389178142182, "grad_norm": 0.37187027389547805, "learning_rate": 9.732851104951036e-09, "loss": 0.0351, "step": 12719 }, { "epoch": 0.990316772906428, "grad_norm": 0.3991114673580084, "learning_rate": 9.577760770409506e-09, "loss": 0.0335, "step": 12720 }, { "epoch": 0.9903946279986375, "grad_norm": 0.3981119962103647, "learning_rate": 9.423915745212508e-09, "loss": 0.0275, "step": 12721 }, { "epoch": 0.9904724830908471, "grad_norm": 0.4301632313298277, "learning_rate": 9.271316038939048e-09, "loss": 0.0311, "step": 12722 }, { "epoch": 0.9905503381830568, "grad_norm": 0.3042852268672106, "learning_rate": 9.119961661099297e-09, "loss": 0.0176, "step": 12723 }, { "epoch": 0.9906281932752664, "grad_norm": 0.4281920884980958, "learning_rate": 8.969852621121267e-09, "loss": 0.0393, "step": 12724 }, { "epoch": 0.990706048367476, "grad_norm": 0.3920677337838353, "learning_rate": 8.820988928353036e-09, "loss": 0.0381, "step": 12725 }, { "epoch": 0.9907839034596857, "grad_norm": 0.40481674370757137, "learning_rate": 8.673370592071628e-09, "loss": 0.0345, "step": 12726 }, { "epoch": 0.9908617585518953, "grad_norm": 0.4182165087706597, "learning_rate": 8.526997621471911e-09, "loss": 0.0386, "step": 12727 }, { "epoch": 0.9909396136441049, "grad_norm": 0.5410108953443054, "learning_rate": 8.381870025673256e-09, "loss": 0.0444, "step": 12728 }, { "epoch": 0.9910174687363146, "grad_norm": 0.4664077089392103, "learning_rate": 8.237987813712878e-09, "loss": 0.0374, "step": 12729 }, { "epoch": 0.9910953238285242, "grad_norm": 0.4073998683157468, "learning_rate": 8.095350994556938e-09, "loss": 0.0357, "step": 12730 }, { "epoch": 0.9911731789207338, "grad_norm": 0.36773042768584735, "learning_rate": 7.953959577089443e-09, "loss": 0.0257, "step": 12731 }, { "epoch": 0.9912510340129435, "grad_norm": 0.3366387004884596, "learning_rate": 7.813813570121121e-09, "loss": 0.0193, "step": 12732 }, { "epoch": 0.991328889105153, "grad_norm": 0.4135522328336599, "learning_rate": 7.674912982380545e-09, "loss": 0.0359, "step": 12733 }, { "epoch": 0.9914067441973626, "grad_norm": 0.35576428724528064, "learning_rate": 7.537257822520794e-09, "loss": 0.023, "step": 12734 }, { "epoch": 0.9914845992895723, "grad_norm": 0.35660776248311005, "learning_rate": 7.40084809911501e-09, "loss": 0.0209, "step": 12735 }, { "epoch": 0.9915624543817819, "grad_norm": 0.4378165519759861, "learning_rate": 7.265683820665282e-09, "loss": 0.0313, "step": 12736 }, { "epoch": 0.9916403094739915, "grad_norm": 0.2896134623138445, "learning_rate": 7.131764995587098e-09, "loss": 0.0173, "step": 12737 }, { "epoch": 0.9917181645662012, "grad_norm": 0.4470728498942783, "learning_rate": 6.999091632227118e-09, "loss": 0.0328, "step": 12738 }, { "epoch": 0.9917960196584108, "grad_norm": 0.38746346528268344, "learning_rate": 6.867663738847619e-09, "loss": 0.0246, "step": 12739 }, { "epoch": 0.9918738747506204, "grad_norm": 0.47270416831807155, "learning_rate": 6.737481323635386e-09, "loss": 0.0355, "step": 12740 }, { "epoch": 0.9919517298428301, "grad_norm": 0.5317704224493628, "learning_rate": 6.608544394701711e-09, "loss": 0.0515, "step": 12741 }, { "epoch": 0.9920295849350397, "grad_norm": 0.29539379012383943, "learning_rate": 6.480852960080164e-09, "loss": 0.0156, "step": 12742 }, { "epoch": 0.9921074400272493, "grad_norm": 0.3247351640123715, "learning_rate": 6.354407027722165e-09, "loss": 0.0156, "step": 12743 }, { "epoch": 0.9921852951194589, "grad_norm": 0.5090250833676717, "learning_rate": 6.229206605508076e-09, "loss": 0.0466, "step": 12744 }, { "epoch": 0.9922631502116686, "grad_norm": 0.6370288609229058, "learning_rate": 6.105251701233883e-09, "loss": 0.0836, "step": 12745 }, { "epoch": 0.9923410053038781, "grad_norm": 0.35903305350235953, "learning_rate": 5.982542322622298e-09, "loss": 0.0302, "step": 12746 }, { "epoch": 0.9924188603960877, "grad_norm": 0.37728547542305363, "learning_rate": 5.861078477320536e-09, "loss": 0.0266, "step": 12747 }, { "epoch": 0.9924967154882974, "grad_norm": 0.43054669301291215, "learning_rate": 5.7408601728914385e-09, "loss": 0.0448, "step": 12748 }, { "epoch": 0.992574570580507, "grad_norm": 0.4157072261462685, "learning_rate": 5.621887416824568e-09, "loss": 0.034, "step": 12749 }, { "epoch": 0.9926524256727166, "grad_norm": 0.40569097414713146, "learning_rate": 5.5041602165317734e-09, "loss": 0.0315, "step": 12750 }, { "epoch": 0.9926524256727166, "eval_loss": 0.004744872450828552, "eval_runtime": 162.2304, "eval_samples_per_second": 17.753, "eval_steps_per_second": 0.635, "step": 12750 }, { "epoch": 0.9927302807649263, "grad_norm": 0.3491028697679273, "learning_rate": 5.387678579349409e-09, "loss": 0.0285, "step": 12751 }, { "epoch": 0.9928081358571359, "grad_norm": 0.45559753243885853, "learning_rate": 5.272442512529452e-09, "loss": 0.0389, "step": 12752 }, { "epoch": 0.9928859909493455, "grad_norm": 0.5403429857621016, "learning_rate": 5.158452023255045e-09, "loss": 0.0629, "step": 12753 }, { "epoch": 0.9929638460415552, "grad_norm": 0.38893328983014264, "learning_rate": 5.045707118624954e-09, "loss": 0.0306, "step": 12754 }, { "epoch": 0.9930417011337648, "grad_norm": 0.6487205890206021, "learning_rate": 4.93420780566245e-09, "loss": 0.065, "step": 12755 }, { "epoch": 0.9931195562259744, "grad_norm": 0.2352078050930559, "learning_rate": 4.8239540913130875e-09, "loss": 0.0085, "step": 12756 }, { "epoch": 0.9931974113181841, "grad_norm": 0.4239705456559346, "learning_rate": 4.7149459824447075e-09, "loss": 0.0317, "step": 12757 }, { "epoch": 0.9932752664103937, "grad_norm": 0.5927194886981473, "learning_rate": 4.607183485849653e-09, "loss": 0.0562, "step": 12758 }, { "epoch": 0.9933531215026032, "grad_norm": 0.43523967526796775, "learning_rate": 4.500666608242554e-09, "loss": 0.04, "step": 12759 }, { "epoch": 0.9934309765948129, "grad_norm": 0.2758962771409454, "learning_rate": 4.3953953562536626e-09, "loss": 0.0137, "step": 12760 }, { "epoch": 0.9935088316870225, "grad_norm": 0.42582318071823233, "learning_rate": 4.291369736446616e-09, "loss": 0.0257, "step": 12761 }, { "epoch": 0.9935866867792321, "grad_norm": 0.30286241913795464, "learning_rate": 4.188589755296235e-09, "loss": 0.0215, "step": 12762 }, { "epoch": 0.9936645418714418, "grad_norm": 0.4787907111433272, "learning_rate": 4.087055419210728e-09, "loss": 0.0481, "step": 12763 }, { "epoch": 0.9937423969636514, "grad_norm": 0.35478887123192865, "learning_rate": 3.9867667345117044e-09, "loss": 0.0242, "step": 12764 }, { "epoch": 0.993820252055861, "grad_norm": 0.4161876535493, "learning_rate": 3.887723707447499e-09, "loss": 0.0363, "step": 12765 }, { "epoch": 0.9938981071480707, "grad_norm": 0.4343659698574077, "learning_rate": 3.789926344188733e-09, "loss": 0.0463, "step": 12766 }, { "epoch": 0.9939759622402803, "grad_norm": 0.3993035584272981, "learning_rate": 3.693374650826087e-09, "loss": 0.038, "step": 12767 }, { "epoch": 0.9940538173324899, "grad_norm": 0.3509597777182187, "learning_rate": 3.5980686333747518e-09, "loss": 0.0271, "step": 12768 }, { "epoch": 0.9941316724246996, "grad_norm": 0.36825096519853184, "learning_rate": 3.5040082977744193e-09, "loss": 0.0259, "step": 12769 }, { "epoch": 0.9942095275169092, "grad_norm": 0.4987837761072322, "learning_rate": 3.4111936498804067e-09, "loss": 0.0497, "step": 12770 }, { "epoch": 0.9942873826091188, "grad_norm": 0.5212506933347916, "learning_rate": 3.319624695476975e-09, "loss": 0.0508, "step": 12771 }, { "epoch": 0.9943652377013285, "grad_norm": 0.40962960542856264, "learning_rate": 3.229301440270671e-09, "loss": 0.0376, "step": 12772 }, { "epoch": 0.994443092793538, "grad_norm": 0.35378111071827223, "learning_rate": 3.1402238898836644e-09, "loss": 0.0183, "step": 12773 }, { "epoch": 0.9945209478857476, "grad_norm": 0.3560519071222394, "learning_rate": 3.05239204986707e-09, "loss": 0.0299, "step": 12774 }, { "epoch": 0.9945988029779573, "grad_norm": 0.3677977455737459, "learning_rate": 2.965805925692067e-09, "loss": 0.0263, "step": 12775 }, { "epoch": 0.9946766580701669, "grad_norm": 0.4137196605165094, "learning_rate": 2.880465522754339e-09, "loss": 0.0406, "step": 12776 }, { "epoch": 0.9947545131623765, "grad_norm": 0.357320656818405, "learning_rate": 2.796370846369634e-09, "loss": 0.0252, "step": 12777 }, { "epoch": 0.9948323682545861, "grad_norm": 0.3741999287226739, "learning_rate": 2.7135219017737636e-09, "loss": 0.0285, "step": 12778 }, { "epoch": 0.9949102233467958, "grad_norm": 0.4916143458746127, "learning_rate": 2.6319186941314854e-09, "loss": 0.0414, "step": 12779 }, { "epoch": 0.9949880784390054, "grad_norm": 0.370443737925761, "learning_rate": 2.5515612285231804e-09, "loss": 0.0325, "step": 12780 }, { "epoch": 0.995065933531215, "grad_norm": 0.342113716879339, "learning_rate": 2.4724495099581746e-09, "loss": 0.0194, "step": 12781 }, { "epoch": 0.9951437886234247, "grad_norm": 0.5191429162661639, "learning_rate": 2.3945835433614174e-09, "loss": 0.0562, "step": 12782 }, { "epoch": 0.9952216437156343, "grad_norm": 0.559797288713822, "learning_rate": 2.317963333584583e-09, "loss": 0.0696, "step": 12783 }, { "epoch": 0.9952994988078439, "grad_norm": 0.3535338592202836, "learning_rate": 2.242588885403851e-09, "loss": 0.0258, "step": 12784 }, { "epoch": 0.9953773539000536, "grad_norm": 0.35869079969975665, "learning_rate": 2.1684602035088045e-09, "loss": 0.0264, "step": 12785 }, { "epoch": 0.9954552089922631, "grad_norm": 0.4578840370623148, "learning_rate": 2.0955772925224104e-09, "loss": 0.0429, "step": 12786 }, { "epoch": 0.9955330640844727, "grad_norm": 0.5081738007608204, "learning_rate": 2.0239401569832618e-09, "loss": 0.049, "step": 12787 }, { "epoch": 0.9956109191766824, "grad_norm": 0.41386059463004476, "learning_rate": 1.953548801352234e-09, "loss": 0.0321, "step": 12788 }, { "epoch": 0.995688774268892, "grad_norm": 0.48537914685245176, "learning_rate": 1.8844032300191496e-09, "loss": 0.0468, "step": 12789 }, { "epoch": 0.9957666293611016, "grad_norm": 0.4358937180338997, "learning_rate": 1.8165034472850118e-09, "loss": 0.0372, "step": 12790 }, { "epoch": 0.9958444844533113, "grad_norm": 0.3837646084204684, "learning_rate": 1.7498494573842117e-09, "loss": 0.0264, "step": 12791 }, { "epoch": 0.9959223395455209, "grad_norm": 0.3543303198389123, "learning_rate": 1.6844412644689833e-09, "loss": 0.019, "step": 12792 }, { "epoch": 0.9960001946377305, "grad_norm": 0.5435205065690514, "learning_rate": 1.6202788726116247e-09, "loss": 0.0515, "step": 12793 }, { "epoch": 0.9960780497299402, "grad_norm": 0.2797431170900324, "learning_rate": 1.5573622858089388e-09, "loss": 0.0119, "step": 12794 }, { "epoch": 0.9961559048221498, "grad_norm": 0.4212095266811741, "learning_rate": 1.4956915079822332e-09, "loss": 0.0364, "step": 12795 }, { "epoch": 0.9962337599143594, "grad_norm": 0.33311872330619513, "learning_rate": 1.4352665429728795e-09, "loss": 0.0224, "step": 12796 }, { "epoch": 0.9963116150065691, "grad_norm": 0.4713344856503949, "learning_rate": 1.3760873945467545e-09, "loss": 0.0389, "step": 12797 }, { "epoch": 0.9963894700987787, "grad_norm": 0.38525389766001644, "learning_rate": 1.3181540663853576e-09, "loss": 0.0293, "step": 12798 }, { "epoch": 0.9964673251909882, "grad_norm": 0.3300825720038131, "learning_rate": 1.2614665621035749e-09, "loss": 0.0169, "step": 12799 }, { "epoch": 0.9965451802831979, "grad_norm": 0.46529220952353495, "learning_rate": 1.2060248852296951e-09, "loss": 0.0428, "step": 12800 }, { "epoch": 0.9965451802831979, "eval_loss": 0.004746290855109692, "eval_runtime": 162.2328, "eval_samples_per_second": 17.752, "eval_steps_per_second": 0.635, "step": 12800 }, { "epoch": 0.9966230353754075, "grad_norm": 0.42145874302778974, "learning_rate": 1.1518290392165121e-09, "loss": 0.0301, "step": 12801 }, { "epoch": 0.9967008904676171, "grad_norm": 0.3599817288436605, "learning_rate": 1.098879027441324e-09, "loss": 0.0227, "step": 12802 }, { "epoch": 0.9967787455598268, "grad_norm": 0.4027554823025903, "learning_rate": 1.047174853203714e-09, "loss": 0.0256, "step": 12803 }, { "epoch": 0.9968566006520364, "grad_norm": 0.3639686274836572, "learning_rate": 9.967165197233286e-10, "loss": 0.0296, "step": 12804 }, { "epoch": 0.996934455744246, "grad_norm": 0.5202114947539622, "learning_rate": 9.475040301443195e-10, "loss": 0.0592, "step": 12805 }, { "epoch": 0.9970123108364557, "grad_norm": 0.5165736414241526, "learning_rate": 8.995373875309021e-10, "loss": 0.0385, "step": 12806 }, { "epoch": 0.9970901659286653, "grad_norm": 0.2422225560869717, "learning_rate": 8.528165948740175e-10, "loss": 0.009, "step": 12807 }, { "epoch": 0.9971680210208749, "grad_norm": 0.47359554172877205, "learning_rate": 8.073416550802293e-10, "loss": 0.045, "step": 12808 }, { "epoch": 0.9972458761130846, "grad_norm": 0.3490986835514063, "learning_rate": 7.631125709872678e-10, "loss": 0.0208, "step": 12809 }, { "epoch": 0.9973237312052942, "grad_norm": 0.39446058003567563, "learning_rate": 7.201293453462654e-10, "loss": 0.0253, "step": 12810 }, { "epoch": 0.9974015862975038, "grad_norm": 0.4132424298666358, "learning_rate": 6.783919808350803e-10, "loss": 0.032, "step": 12811 }, { "epoch": 0.9974794413897135, "grad_norm": 0.3903618589589503, "learning_rate": 6.379004800560751e-10, "loss": 0.0239, "step": 12812 }, { "epoch": 0.997557296481923, "grad_norm": 0.3104611230604051, "learning_rate": 5.986548455316765e-10, "loss": 0.0156, "step": 12813 }, { "epoch": 0.9976351515741326, "grad_norm": 0.41226406365774876, "learning_rate": 5.606550797043753e-10, "loss": 0.0299, "step": 12814 }, { "epoch": 0.9977130066663422, "grad_norm": 0.4850714193166748, "learning_rate": 5.239011849411668e-10, "loss": 0.0458, "step": 12815 }, { "epoch": 0.9977908617585519, "grad_norm": 0.5196650579664785, "learning_rate": 4.88393163535772e-10, "loss": 0.0501, "step": 12816 }, { "epoch": 0.9978687168507615, "grad_norm": 0.4722994582626924, "learning_rate": 4.54131017695314e-10, "loss": 0.0366, "step": 12817 }, { "epoch": 0.9979465719429711, "grad_norm": 0.4593000431344998, "learning_rate": 4.211147495580825e-10, "loss": 0.045, "step": 12818 }, { "epoch": 0.9980244270351808, "grad_norm": 0.3622402212878514, "learning_rate": 3.8934436117799014e-10, "loss": 0.0257, "step": 12819 }, { "epoch": 0.9981022821273904, "grad_norm": 0.28146025391193347, "learning_rate": 3.588198545356747e-10, "loss": 0.0158, "step": 12820 }, { "epoch": 0.9981801372196, "grad_norm": 0.4608419673567097, "learning_rate": 3.29541231531838e-10, "loss": 0.0397, "step": 12821 }, { "epoch": 0.9982579923118097, "grad_norm": 0.4692909399630921, "learning_rate": 3.015084939916868e-10, "loss": 0.0517, "step": 12822 }, { "epoch": 0.9983358474040193, "grad_norm": 0.4058338291474981, "learning_rate": 2.747216436604916e-10, "loss": 0.027, "step": 12823 }, { "epoch": 0.9984137024962288, "grad_norm": 0.42198216118627313, "learning_rate": 2.491806822058074e-10, "loss": 0.0303, "step": 12824 }, { "epoch": 0.9984915575884385, "grad_norm": 0.4436129603397557, "learning_rate": 2.2488561122191443e-10, "loss": 0.0338, "step": 12825 }, { "epoch": 0.9985694126806481, "grad_norm": 0.4758257118664694, "learning_rate": 2.0183643221871607e-10, "loss": 0.0448, "step": 12826 }, { "epoch": 0.9986472677728577, "grad_norm": 0.47690049878414476, "learning_rate": 1.8003314663506134e-10, "loss": 0.0422, "step": 12827 }, { "epoch": 0.9987251228650674, "grad_norm": 0.4264357892533392, "learning_rate": 1.5947575582764274e-10, "loss": 0.0391, "step": 12828 }, { "epoch": 0.998802977957277, "grad_norm": 0.4268092002256422, "learning_rate": 1.4016426107543723e-10, "loss": 0.0343, "step": 12829 }, { "epoch": 0.9988808330494866, "grad_norm": 0.49799948299962765, "learning_rate": 1.2209866358636746e-10, "loss": 0.0526, "step": 12830 }, { "epoch": 0.9989586881416963, "grad_norm": 0.43820157966558115, "learning_rate": 1.0527896448175867e-10, "loss": 0.0339, "step": 12831 }, { "epoch": 0.9990365432339059, "grad_norm": 0.554062294514454, "learning_rate": 8.97051648096614e-11, "loss": 0.0641, "step": 12832 }, { "epoch": 0.9991143983261155, "grad_norm": 0.5638268326826249, "learning_rate": 7.537726554041058e-11, "loss": 0.0581, "step": 12833 }, { "epoch": 0.9991922534183252, "grad_norm": 0.38568504019652255, "learning_rate": 6.229526756884596e-11, "loss": 0.0334, "step": 12834 }, { "epoch": 0.9992701085105348, "grad_norm": 0.44095429745389775, "learning_rate": 5.04591717076508e-11, "loss": 0.0371, "step": 12835 }, { "epoch": 0.9993479636027444, "grad_norm": 0.533364098803477, "learning_rate": 3.986897869623363e-11, "loss": 0.0478, "step": 12836 }, { "epoch": 0.9994258186949541, "grad_norm": 0.513531336853202, "learning_rate": 3.052468918962603e-11, "loss": 0.0484, "step": 12837 }, { "epoch": 0.9995036737871636, "grad_norm": 0.39299549326380473, "learning_rate": 2.2426303776246215e-11, "loss": 0.0323, "step": 12838 }, { "epoch": 0.9995815288793732, "grad_norm": 0.3418985194072482, "learning_rate": 1.557382295569454e-11, "loss": 0.0174, "step": 12839 }, { "epoch": 0.9996593839715829, "grad_norm": 0.39307442131861614, "learning_rate": 9.96724715651709e-12, "loss": 0.0306, "step": 12840 }, { "epoch": 0.9997372390637925, "grad_norm": 0.296769432896363, "learning_rate": 5.606576729544344e-12, "loss": 0.0175, "step": 12841 }, { "epoch": 0.9998150941560021, "grad_norm": 0.4438234933055643, "learning_rate": 2.4918119456707192e-12, "loss": 0.0365, "step": 12842 }, { "epoch": 0.9998929492482118, "grad_norm": 0.39731985669910447, "learning_rate": 6.229529958545755e-13, "loss": 0.0226, "step": 12843 }, { "epoch": 0.9999708043404214, "grad_norm": 0.3502181628060406, "learning_rate": 0.0, "loss": 0.0205, "step": 12844 } ], "logging_steps": 1, "max_steps": 12844, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.658116305171251e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }